-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresources.py
96 lines (84 loc) · 3.01 KB
/
resources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# coding: utf-8
import os
import time
import random
import requests
import pdfkit
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Feed HTML to it with ``feed()`` and read the accumulated text back with
    ``get_data()``. Character references are decoded by the base parser
    before they reach ``handle_data``.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset()
        # is needed afterwards.
        super().__init__()
        # Text fragments, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html: HTML source as a string.

    Returns:
        The text nodes of the document concatenated into one string.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing input it considers incomplete (for
    # example text ending in a bare '&'); close() forces the parser to
    # process it so nothing is dropped from the result.
    s.close()
    return s.get_data()
def get_article_url_list(query, timeout=30):
    """Collect every search-result entry for *query* from the search endpoint.

    The endpoint is first requested without a page number to read the total
    hit count ('TOTCNT'). Each result page is then fetched in turn, with a
    random pause before every page request, and the 'CONTENT' list of each
    page is appended to the result.

    Args:
        query: Search term, interpolated into the URL as-is.
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up. Defaults to 30.

    Returns:
        A list with the 'CONTENT' items of all pages; empty when the
        endpoint reports no hits.
    """
    # Page size assumed by the paging arithmetic below.
    # TODO confirm against the API that a page really holds 10 items.
    page_size = 10

    return_list = []
    # NOTE(review): the query is not escaped here. requests percent-encodes
    # non-ASCII characters itself, but a literal '&' or '#' in the query
    # would be misread as URL syntax -- confirm what callers pass in.
    url = 'http://m.biz.chosun.com/svc/searchData.html?query={}'.format(query)
    headers = {
        'Content-Type': 'application/json; charset=utf-8',
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.44 Safari/534.7',
    }

    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    # A missing or empty TOTCNT is treated as "no results" instead of
    # crashing inside int(None).
    tot_article_count = int(r.json().get('TOTCNT') or 0)

    # Ceiling division: an exact multiple of the page size must not produce
    # an extra, empty trailing page (and zero hits means zero pages).
    total_pages = (tot_article_count + page_size - 1) // page_size

    for pn in range(1, total_pages + 1):
        # Pause 1.0 - 3.9 s (0.1 s steps) before each page request.
        _sleep_time = float(random.randrange(10, 40)) / 10
        print('[{}/{} waiting: {}s] 읽는 중입니다..'.format(pn,
                                                      total_pages,
                                                      _sleep_time))
        time.sleep(_sleep_time)

        r = requests.get(url + '&pn={}'.format(pn), headers=headers,
                         timeout=timeout)
        contents = r.json().get('CONTENT')
        if contents:
            return_list.extend(contents)
        else:
            print('[WARNING] no contents in {}&pn={}'.format(url, pn))
    return return_list
def save_to_pdf(cont_id, directory, title=None,
                wkhtmltopdf='/usr/local/bin/wkhtmltopdf'):
    """Render one article page to ``<directory>/<name>.pdf`` via wkhtmltopdf.

    Args:
        cont_id: Article content id; used to build the article URL and, when
            no title is given, as the output file name.
        directory: Directory the PDF is written to. It is created if it
            does not exist yet.
        title: Optional file name (without extension) to use instead of
            ``cont_id``. Path separators in it are replaced with '_'.
        wkhtmltopdf: Path of the wkhtmltopdf executable handed to pdfkit.

    Returns:
        True when a PDF was generated, False when the target file already
        existed and nothing was written.
    """
    # basic settings
    url = 'http://m.biz.chosun.com/svc/article.html?contid={}'.format(cont_id)
    options = {
        'footer-right': 'Page [page] of [toPage]',
        'page-size': 'A4',
        'margin-top': '0.40in',
        'margin-bottom': '0.40in',
        'margin-right': '0.40in',
        'margin-left': '0.40in',
        'encoding': "UTF-8",
        'no-outline': None,
        'dpi': 750,
        'load-media-error-handling': 'ignore',
        'load-error-handling': 'ignore',
    }

    # No table of contents by default. To get one, pass a dict of TOC
    # options here instead, e.g. {'load-media-error-handling': 'ignore',
    # 'load-error-handling': 'ignore'}.
    toc = None

    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf)

    file_name = str(title if title is not None else cont_id)
    # A path separator inside the name would point the output at a
    # different (usually non-existent) directory, so neutralise it.
    for sep in filter(None, (os.sep, os.altsep)):
        file_name = file_name.replace(sep, '_')

    # Build the path once so the existence check and the renderer always
    # refer to the same file, including for absolute or empty directories.
    output_path = os.path.join(directory, file_name + '.pdf')

    # if same file already exists,
    if os.path.exists(output_path):
        print('[WARNING] file {}.pdf already exists.'.format(file_name))
        return False

    print('[NOTE] file {}.pdf ready to make..'.format(file_name))
    if directory:
        # wkhtmltopdf cannot write into a directory that does not exist.
        os.makedirs(directory, exist_ok=True)

    # html url to pdf file
    pdfkit.from_url(url=url,
                    configuration=config,
                    output_path=output_path,
                    options=options,
                    toc=toc)
    return True