-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path3B_fetch_text_random.py
56 lines (43 loc) · 1.45 KB
/
3B_fetch_text_random.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Scrape random Wikipedia articles.
This is probably the simplest way to get a large number of articles from Wikipedia in any language.
Most of these articles are very short and of lower quality.
"""
import os
import timeit
from slugify import slugify
import wikipedia
# Wikipedia language (subdomain) codes to scrape, e.g. cs.wikipedia.org.
LANGUAGES = [
    "it",  # Italian
    "nl",  # Dutch
    "cs",  # Czech (fixed: "cz" is not a valid Wikipedia language code)
    "sv",  # Swedish (fixed: "se" is the Northern Sami wiki, not Swedish)
    "no",  # Norwegian
    "fi",  # Finnish
]
start = timeit.default_timer()  # wall-clock start for the execution-time report at the end of the script
def make_directory(language):
    """Create the directory *language* (including parents) if it does not already exist.

    Args:
        language: Filesystem path of the directory to create, e.g. "text/it".
    """
    # exist_ok=True replaces the original exists()/makedirs() pair, which had a
    # check-then-create race (another process could create the dir in between).
    os.makedirs(language, exist_ok=True)
# Download ~1000 random articles (100 batches of 10 titles) for each language
# and save each article's plain text under text/<lang>/<slug>.txt.
for LANGUAGE in LANGUAGES:
    make_directory("text/" + LANGUAGE)
    wikipedia.set_lang(LANGUAGE)
    print(LANGUAGE)
    for _ in range(100):
        for title in wikipedia.random(pages=10):
            try:
                text = wikipedia.WikipediaPage(title).content
                # "with" guarantees the file is closed even if write() raises;
                # explicit UTF-8 avoids crashes on non-ASCII article text under
                # platform-dependent default encodings.
                with open(
                    "text/" + LANGUAGE + "/" + slugify(title) + ".txt",
                    "w",
                    encoding="utf-8",
                ) as file:
                    file.write(text)
            except wikipedia.exceptions.DisambiguationError:
                # Random titles occasionally resolve to disambiguation pages;
                # skip them rather than arbitrarily picking one of the options.
                print("exception.DisambiguationError")

stop = timeit.default_timer()
print("Execution Time: ", stop - start)