TIPE2021/wikisourcer

19 lines
554 B
Bash

#!/bin/bash
#
# Builds a plain-ASCII text corpus from random French Wikipedia articles.
#
# Repeatedly downloads the "random page" URL, keeps the HTML from the line
# containing id="firstHeading" up to (but not including) the first following
# line containing id="Notes", extracts the text with trafilatura,
# transliterates it to ASCII with iconv, and appends the result to
# $outputfile until that file holds at least $limit bytes.
#
# Usage:    ./wikisourcer      (takes no arguments)
# Requires: curl, awk, trafilatura, iconv, wc
# Outputs:  progress lines on stdout, diagnostics on stderr,
#           corpus in ./wikifile.ascii.txt (recreated on every run)
# Exit:     0 once the size limit is reached; 1 if a required tool is missing,
#           the output file cannot be created, or the file stops growing.

set -u

readonly outputfile="wikifile.ascii.txt"
readonly limit=10000
# Give up after this many consecutive fetches that add no bytes, so a network
# outage or a broken extraction step cannot turn into an endless loop.
readonly max_stalls=5
readonly random_page_url="https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Page_au_hasard"

# Prints an error message to stderr and terminates the script with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Prints the size in bytes of the file given as $1.
file_size() {
  # The arithmetic expansion strips the padding some wc implementations emit.
  printf '%s\n' "$(( $(wc -c < "$1") ))"
}

# Downloads one random article and writes its ASCII text to stdout.
fetch_article() {
  # A single awk pass replaces the two chained filters. It deliberately keeps
  # reading after the "Notes" marker instead of exiting, so curl is never
  # killed by a closed pipe.
  curl -sSL "$random_page_url" \
    | awk '
        /id="firstHeading"/        { in_article = 1 }
        in_article && /id="Notes"/ { past_notes = 1 }
        in_article && !past_notes  { print }
      ' \
    | trafilatura \
    | iconv -f utf-8 -t ascii//TRANSLIT
}

main() {
  local tool size new_size
  local stalls=0

  # Fail fast with a clear message instead of looping on a missing command.
  for tool in curl awk trafilatura iconv wc; do
    command -v "$tool" > /dev/null || die "required tool not found: $tool"
  done

  # Start every run from an empty output file.
  rm -f -- "$outputfile"
  : > "$outputfile" || die "cannot create $outputfile"

  size=$(file_size "$outputfile")
  while (( size < limit )); do
    printf 'Current file size: %s\n' "$size"

    fetch_article >> "$outputfile"

    new_size=$(file_size "$outputfile")
    if (( new_size > size )); then
      stalls=0
    else
      stalls=$(( stalls + 1 ))
      (( stalls < max_stalls )) \
        || die "no data added in $max_stalls consecutive attempts; giving up"
      # Brief pause before retrying so a failing run does not hammer the site.
      sleep 1
    fi
    size=$new_size
  done
}

main "$@"