diabetes.py
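
"""Crawl the Portuguese Wikipedia article on diabetes mellitus and the
articles it links to, collecting the body text one sentence per line
into data/output.txt."""
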
import requests
from bs4 import BeautifulSoup

start_url = 'https://pt.wikipedia.org/wiki/Diabetes_mellitus'
domain = 'https://pt.wikipedia.org'
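

# Fetch a page and parse it into a BeautifulSoup tree.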
def get_soup(url):
    content = requests.get(url).content
    return BeautifulSoup(content.decode('utf-8'), 'html.parser')
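

# Round-trip the text through UTF-8 as a defensive normalization, then
# split it into rough sentences on '. ' boundaries.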
def clean_input(content):
    content = bytes(content, "UTF-8")
    content = content.decode("UTF-8")
    sentences = content.split('. ')
    return sentences
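

# Collect the hrefs of every internal article link in the given paragraphs.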
def extract_links(paragraphs):
    tags = []
    for paragraph in paragraphs:
        tags.extend(paragraph.find_all('a'))
    tags = [tag for tag in tags if 'title' in tag.attrs and 'href' in tag.attrs]
    links = [tag.get('href') for tag in tags]
    return links
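

# Pull the paragraph text and outgoing links from a single article.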
def extract_content(url=start_url):
    content = []
    soup = get_soup(url)
    content_items = soup.find('div', {'id': 'mw-content-text'})
    # Strip <script> tags and <sup> footnote markers before reading the text.
    for script in content_items.find_all('script'):
        script.extract()
    for sup in content_items.find_all('sup'):
        sup.extract()
    content_items = content_items.find_all('p')
    content_links = extract_links(content_items)
    for content_item in content_items:
        texts = clean_input(content_item.get_text())
        for text in texts:
            if not text.isspace():
                content.append(text + "\n")
    return content, content_links
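

# Crawl the start page and every article it links to, accumulating
# sentences until the cap is reached.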
def extract_text(url=start_url):
    sentences = []
    content, links = extract_content(url)
    sentences.extend(content)
    for link in links:
        print('Items : {}'.format(len(sentences)))
        page_content, _ = extract_content(domain + link)
        sentences.extend(page_content)
        # Stop once roughly 5000 sentences have been collected.
        if len(sentences) > 5000:
            break
    return sentences
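

# Write the collected sentences to data/output.txt, one per line.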
def main():
    items = extract_text(url=start_url)
    with open('data/output.txt', 'w', encoding='utf-8') as f:
        f.writelines(items)
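

if __name__ == '__main__':
    main()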