Ajout d'un programme permettant de télécharger beaucoup de texte depuis Wikipédia
This commit is contained in:
parent
ee818912e9
commit
f85af86a52
18
wikisourcer
Normal file
18
wikisourcer
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
#
# wikisourcer - build a plain-ASCII text file from random French Wikipedia
# articles.
#
# Usage: wikisourcer [output_file [min_bytes]]
#   output_file  file to (re)create; defaults to wikifile.ascii.txt
#   min_bytes    keep fetching until the file holds at least this many
#                bytes; defaults to 10000
#
# Requires: curl, awk, trafilatura, iconv.

set -euo pipefail

readonly RANDOM_PAGE_URL="https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Page_au_hasard"
# Give up after this many consecutive rounds that add no bytes, so a dead
# network or a broken tool cannot turn the loop into an endless request storm.
readonly MAX_STALLED_ROUNDS=5

outputfile="${1:-wikifile.ascii.txt}"
limit="${2:-10000}"

# Print a message on stderr and abort.
die() {
  printf 'wikisourcer: %s\n' "$*" >&2
  exit 1
}

# Print the size in bytes of file $1.
# wc -c is used instead of GNU-only `du --bytes`; the arithmetic expansion
# strips the padding some wc implementations emit.
file_size() {
  local bytes
  bytes=$(wc -c < "$1") || return
  printf '%s\n' "$((bytes))"
}

# Download one random article and write its text, transliterated to ASCII,
# on stdout. Only the HTML lines from the first one containing
# id="firstHeading" up to (excluding) the first later one containing
# id="Notes" are handed to trafilatura.
fetch_article() {
  # --fail: treat HTTP errors as failures instead of parsing an error page.
  # --max-time: never hang forever on a stalled connection.
  curl -sSL --fail --max-time 60 "$RANDOM_PAGE_URL" \
    | awk '
        finished { next }                     # keep draining so curl never sees a closed pipe
        !started && /id="firstHeading"/ { started = 1 }
        started && /id="Notes"/ { finished = 1; next }
        started { print }
      ' \
    | trafilatura \
    | iconv -f utf-8 -t ascii//TRANSLIT
}

main() {
  local tool size new_size stalled=0

  [[ "$limit" =~ ^[0-9]+$ ]] || die "min_bytes must be a non-negative integer: $limit"
  limit=$((10#$limit)) # force base 10 so a leading zero is not read as octal

  for tool in curl awk trafilatura iconv; do
    command -v "$tool" > /dev/null || die "required tool not found: $tool"
  done

  # Start from an empty file.
  : > "$outputfile" || die "cannot create $outputfile"

  size=$(file_size "$outputfile")
  while ((size < limit)); do
    echo "Current file size: $size"

    if ! fetch_article >> "$outputfile"; then
      printf 'wikisourcer: warning: a stage of the fetch pipeline failed\n' >&2
    fi

    new_size=$(file_size "$outputfile")
    if ((new_size > size)); then
      stalled=0
    else
      stalled=$((stalled + 1))
      ((stalled < MAX_STALLED_ROUNDS)) \
        || die "no new text after $MAX_STALLED_ROUNDS consecutive attempts; giving up"
    fi
    size=$new_size
  done
}

main "$@"
|
||||||
Loading…
x
Reference in New Issue
Block a user