Ajout d'un programme permettant de télécharger beaucoup de texte depuis Wikipédia
This commit is contained in:
parent
ee818912e9
commit
f85af86a52
18
wikisourcer
Normal file
18
wikisourcer
Normal file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
#
# wikisourcer - build an ASCII text corpus from random French Wikipedia pages.
#
# Repeatedly downloads a random article, keeps the HTML lines from the page
# title (id="firstHeading") up to, but excluding, the notes section
# (id="Notes"), extracts the text with trafilatura, transliterates it to ASCII
# and appends it to $outputfile until that file holds at least $limit bytes.
#
# Requires: curl, awk, trafilatura, iconv.
# Exit status: 0 once the size limit is reached, 1 if the output file cannot
# be created or if several consecutive downloads add no text.

# Catch typos in variable names. `set -e` is deliberately not used: a failed
# download is handled by the retry logic below rather than aborting the run.
set -u

outputfile="wikifile.ascii.txt"
limit=10000      # target size of the output file, in bytes
max_failures=5   # consecutive fruitless attempts tolerated before giving up
random_page_url="https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Page_au_hasard"

# Print the size in bytes of the file given as $1.
file_size() {
  # The arithmetic expansion strips the padding some wc implementations emit.
  echo "$(( $(wc -c <"$1") ))"
}

# Always start from a fresh, empty output file.
rm -f -- "$outputfile"
touch -- "$outputfile" || exit 1

failures=0
size=$(file_size "$outputfile")
while (( size < limit )); do
  echo "Current file size: $size"

  # --fail makes curl emit nothing on HTTP errors, so an error page is never
  # mistaken for an article. The awk filter keeps reading after the notes
  # marker (instead of exiting) so that curl is not cut off mid-transfer.
  curl -fsSL "$random_page_url" \
    | awk '
        /id="firstHeading"/ { in_article = 1 }
        in_article && /id="Notes"/ { past_notes = 1 }
        in_article && !past_notes { print }
      ' \
    | trafilatura \
    | iconv -f utf-8 -t ascii//TRANSLIT \
    >>"$outputfile"

  # Judge success by whether the file grew: this covers a failure of any
  # pipeline stage without depending on the exit status of each tool.
  new_size=$(file_size "$outputfile")
  if (( new_size > size )); then
    failures=0
  else
    failures=$(( failures + 1 ))
    if (( failures >= max_failures )); then
      echo "wikisourcer: no text retrieved after $failures consecutive attempts, giving up" >&2
      exit 1
    fi
    # Brief pause so a network outage does not turn into a tight request loop.
    sleep 1
  fi
  size=$new_size
done
|
||||
Loading…
x
Reference in New Issue
Block a user