l-ukrainka_name.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
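#
# Scrape the Prose, Publicistics, Criticism, and Corresp sections of
# l-ukrainka.name (the works of Lesya Ukrainka): each work is saved as a
# UTF-8 text file under LU/<section>/ and indexed with one CSV row per work.
# Python 2 script (urllib2, print statements, unichr).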
import os
# import csv
from bs4 import BeautifulSoup
# from urllib.request import urlopen
from urllib2 import urlopen
import codecs
import re
import unicodecsv as csv
base_urls = ["http://www.l-ukrainka.name/uk/Prose.html", "http://www.l-ukrainka.name/uk/Publicistics.html",
             "http://www.l-ukrainka.name/uk/Criticism.html", "http://www.l-ukrainka.name/uk/Corresp.html"]

author = u"Леся Українка"

def clear_unicode_string(s):
    # strip ASCII / Latin-1 control characters that leak into link titles
    control_chars = ''.join(map(unichr, range(0, 32) + range(127, 160)))
    control_char_re = re.compile('[%s]' % re.escape(control_chars))
    return control_char_re.sub('', s)

def do_bs4(url):
    # section name is the last path component of the URL, e.g. ".../Prose.html" -> "Prose"
    name = re.sub("^.*/([^.]*).*", "\\1", url)
    print name
    name = "LU/" + name
    if not os.path.exists(name):
        os.makedirs(name)
    response = urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all("p", class_="BT")
    menu_links = soup.find_all("p", class_="Menu1")
    # remove links which don't actually have links before counting
    # (filter into a new list rather than calling remove() while iterating,
    # which would skip elements)
    links = [link for link in links if link.find("a")]
    if len(menu_links) > len(links):
        links = menu_links
    for link in links:
        aelt = link.find("a")
        if not aelt:
            continue  # some texts are missing?
        if 'title' in aelt.attrs:
            linkname = clear_unicode_string(aelt['title'] + aelt.next_sibling.string)  # extract silly date
        else:
            linkname = aelt.text
        if linkname == '+':
            linkname = aelt.parent.next_sibling.text
        process_item(linkname, aelt['href'], name)

def get_text(body_elt):
    # cut the text off at the notes / navigation footer, when present
    text = body_elt.text
    idx = text.rfind(u"Примітки")
    if idx == -1:
        idx = text.rfind(u"Попередній твір")
    if idx == -1:
        return text  # no marker found; keep the whole text
    return text[:idx]

def get_body_and_menu_elts(url):
    response = urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find("td", id="Stuff")
    menu = soup.find("div", class_="TreeDiv")
    return body, menu

def process_item(title, url, section_path):
    body, menu = get_body_and_menu_elts(url)
    # see if this is a multi-part page
    links = menu.find_all("p", class_="Menu")
    if not links:
        text = get_text(body)
    else:
        print "multi part", title
        text = ""
        for link in links:
            aelt = link.find("a")
            subbody, _ = get_body_and_menu_elts(aelt['href'])
            text += get_text(subbody)
    # silly way to extract year from the title
    date = re.findall("[0-9]{4}", title)
    if date:
        date = date[0]
    else:
        date = ""
    print title, date
    filename = section_path + "/" + title + ".txt"
    with codecs.open(filename, "w", "utf-8-sig") as outfile:
        outfile.write(text)
    with open(author + ".csv", "a") as csvfile:
        csvwriter = csv.writer(csvfile, encoding='utf-8')
        csvwriter.writerow([author, title, filename, date, len(text)])

if __name__ == "__main__":
    # do_bs4(base_urls[3])
    for url in base_urls:
        do_bs4(url)
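
# Usage (Python 2, with bs4 and unicodecsv installed):
#   python l-ukrainka_name.py
# To test a single section, use the commented do_bs4(base_urls[3]) call
# above in place of the loop.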