Ajout d'un programme permettant de télécharger beaucoup de texte depuis Wikipédia

This commit is contained in:
MysaaJava 2020-11-12 19:46:44 +01:00
parent ee818912e9
commit f85af86a52

18
wikisourcer Normal file
View File

@ -0,0 +1,18 @@
#!/bin/bash
#
# wikisourcer - build a plain-ASCII text file from random French Wikipedia
# articles.
#
# Each iteration downloads a random article, keeps only the HTML between the
# line carrying id="firstHeading" and the line carrying id="Notes" (the latter
# excluded), extracts the text with trafilatura, transliterates it to ASCII
# and appends it to $outputfile. The loop stops once the file holds at least
# $limit bytes.
#
# Requires: curl, awk, trafilatura, iconv.
# Exit status: 0 when the size target is reached, 1 on a missing tool, an
# unwritable output file, or too many consecutive iterations without progress.

set -u

outputfile="wikifile.ascii.txt"
limit=10000
# Abort after this many consecutive iterations that add nothing to the file,
# so a dead network or a broken extractor cannot make the loop spin forever.
max_stalled=5

for tool in curl awk trafilatura iconv; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'wikisourcer: required tool not found: %s\n' "$tool" >&2
    exit 1
  fi
done

# Start from an empty output file.
rm -f -- "$outputfile"
if ! : > "$outputfile"; then
  printf 'wikisourcer: cannot create %s\n' "$outputfile" >&2
  exit 1
fi

# Print the size of the output file in bytes. The arithmetic expansion strips
# the padding some wc implementations put around the number.
output_size() {
  printf '%d\n' "$(( $(wc -c < "$outputfile") ))"
}

stalled=0
size=$(output_size)
while (( size < limit )); do
  echo "Current file size: $size"

  # --fail keeps HTTP error pages out of the corpus; --max-time bounds a hung
  # connection. The awk filter reads its whole input (no early exit) so curl
  # never sees a closed pipe.
  curl -sSL --fail --max-time 60 \
      "https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Page_au_hasard" \
    | awk '
        !started && /id="firstHeading"/ { started = 1 }
        started && /id="Notes"/         { finished = 1 }
        started && !finished            { print }
      ' \
    | trafilatura \
    | iconv -f utf-8 -t ascii//TRANSLIT \
    >> "$outputfile"

  # Progress is judged by file growth rather than pipeline status: iconv may
  # report a non-zero status for untransliterable characters while still
  # having produced usable output.
  new_size=$(output_size)
  if (( new_size > size )); then
    stalled=0
  else
    stalled=$(( stalled + 1 ))
    if (( stalled >= max_stalled )); then
      printf 'wikisourcer: no new text after %d consecutive attempts, giving up\n' \
        "$stalled" >&2
      exit 1
    fi
  fi
  size=$new_size
done