From f85af86a527b2bb0bcadcf5e7ce37f0a20efeef3 Mon Sep 17 00:00:00 2001
From: MysaaJava
Date: Thu, 12 Nov 2020 19:46:44 +0100
Subject: [PATCH] =?UTF-8?q?Ajout=20d'un=20programme=20permettant=20de=20t?=
 =?UTF-8?q?=C3=A9l=C3=A9charger=20beaucoup=20de=20texte=20depuis=20wikip?=
 =?UTF-8?q?=C3=A9dia?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (ignored by git-am, not part of the commit message):
the script body was revised during review -- expansions are quoted, the
output file and byte limit may be given as arguments, required tools are
checked up front, and the loop aborts after repeated fetches that add no
text. The commit id above and the blob id on the "index" line below are
carried over from the original submission and were not recomputed.

 wikisourcer | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 wikisourcer

diff --git a/wikisourcer b/wikisourcer
new file mode 100644
index 0000000..f90b475
--- /dev/null
+++ b/wikisourcer
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+# wikisourcer - build an ASCII text corpus from random French Wikipedia pages.
+#
+# Usage: wikisourcer [OUTPUT_FILE [MIN_BYTES]]
+#   OUTPUT_FILE  file to (re)create; default: wikifile.ascii.txt
+#   MIN_BYTES    keep fetching until the file holds at least this many
+#                bytes; default: 10000
+#
+# Requires: curl, awk, trafilatura, iconv.
+
+set -u
+
+outputfile="${1:-wikifile.ascii.txt}"
+limit="${2:-10000}"
+# Give up after this many consecutive fetches that add no text, so a network
+# outage or an extraction failure cannot turn the loop into an endless retry.
+max_stalls=5
+
+# Print an error message on stderr and abort.
+die() {
+  printf 'wikisourcer: %s\n' "$*" >&2
+  exit 1
+}
+
+# Print the size in bytes of the file named by $1.
+file_size() {
+  # Arithmetic expansion strips the padding some wc implementations emit.
+  printf '%d\n' "$(( $(wc -c < "$1") ))"
+}
+
+[[ "$limit" =~ ^[0-9]+$ ]] || die "MIN_BYTES must be a non-negative integer: $limit"
+limit=$(( 10#$limit ))  # force base 10 so a leading zero is not read as octal
+for tool in curl awk trafilatura iconv; do
+  command -v "$tool" > /dev/null || die "required tool not found: $tool"
+done
+
+# Start from an empty output file.
+rm -f -- "$outputfile"
+: > "$outputfile" || die "cannot create $outputfile"
+
+size=$(file_size "$outputfile")
+stalls=0
+while (( size < limit )); do
+  echo "Current file size: $size"
+  # Keep the HTML starting at the first line carrying id="firstHeading" and
+  # stopping before the first line, from there on, carrying id="Notes"; then
+  # run it through trafilatura and transliterate the result to ASCII.
+  curl -sSL "https://fr.wikipedia.org/wiki/Sp%C3%A9cial:Page_au_hasard" \
+    | awk '
+        finished { next }
+        !started && /id="firstHeading"/ { started = 1 }
+        started && /id="Notes"/ { finished = 1; next }
+        started { print }
+      ' \
+    | trafilatura \
+    | iconv -f utf-8 -t ascii//TRANSLIT \
+    >> "$outputfile"
+
+  new_size=$(file_size "$outputfile")
+  if (( new_size > size )); then
+    stalls=0
+  else
+    stalls=$(( stalls + 1 ))
+    (( stalls < max_stalls )) \
+      || die "no text added in $max_stalls consecutive attempts; giving up"
+  fi
+  size=$new_size
+done