#!/usr/bin/python
# -*- coding: UTF-8 -*-
import random
import sys

from app.QuanBenMianFeiCrawler import QuanBenCrawler
from app.QuanbenUpdater import QuanBenUpdater
from app.ShuqiCrawler import ShuqiCrawler
from app.ZhuiShuShenQiCrawler import ZssqCrawler
from app.ershoufang import Ershoufang
from app.mianfeiTXTCrawler import MianFeiTXTCrawler
from app.mianfeiTXTUpdater import MianFeiTXTUpdater
from app.shuqi import start
# from app.shuqiNewFilder import ShuqiFilder
from app.shuqiUpdater import ShuqiUpdater
from dao.dushuService import loadExistsSQId
from local.shuqi.shuqiLocal import loadShuQC
from manager.Manager import Manager, crawlManager
from rest.restServices import WebServer
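

# Entry-point wiring: __main__ registers each crawler class with crawlManager
# under the name used to select it at runtime, then starts the REST control
# server. shuqiTest() below is a standalone batch helper that sweeps a range
# of Shuqi book ids and crawls only ids not already recorded as stored.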
def shuqiTest():
    # updateCapDigest()
    # http://api.shuqireader.com/reader/bc_cover.php?bookId=93511
    # handleWebsiteNoise(581398, 582410)
    bloom = loadExistsSQId()
    # shuqCategory2 = loadShuQC()
    db_dushu = 'default'
    db_acticle = 'default'
    st = 10000
    end = 7000000
    if len(sys.argv) > 2:
        st = int(sys.argv[1])
        end = int(sys.argv[2])
    if len(sys.argv) > 4:
        db_dushu = sys.argv[3]
        db_acticle = sys.argv[4]
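    # Example invocation (assumed usage; db names depend on the local dao config):
    #   python startUp.py 10000 7000000 default default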
    # Load the digests of chapters already stored in the DB
    # shuqiAddInit()
    # nullBookIds = open('nullSQID.txt', 'r')
    # nullIdSet = set()
    # while 1:
    #     sqid = nullBookIds.readline()
    #     if not sqid:
    #         break
    #     nullIdSet.add(int(sqid.replace('\n', '')))
    # st = 10000
    # end = 30000
    # uploadCapByCid(int(sys.argv[1]))
    # uploadCapFromTo(649943, 650090)
    # uploadCapFromTo(int(sys.argv[1]), int(sys.argv[2]))
    # seq = range(st, end)
    print 'start from shuqi id ', st, ' to ', end, '; insert into ', db_dushu, ', and ', db_acticle
    idx = st
    carry = 10000
    while idx < end:
        # seq = range(5000, 6000)
        seq = range(idx, idx + carry)
        # Shuffle each batch so requests do not hit ids in sequential order
        random.shuffle(seq)
        for sqBid in seq:
            # print sqBid
            # if sqBid in nullIdSet:
            #     continue
            if not 'shuqi' + str(sqBid) in bloom:
                try:
                    start(sqBid)
                except IOError as e2:
                    print sqBid, ': ', e2
                except Exception as e:
                    print sqBid, ': ', e
                bloom.add('shuqi' + str(sqBid))
        idx = idx + carry
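
    # Note: loadExistsSQId() presumably seeds the bloom set with 'shuqi<id>'
    # keys for books already stored, and bloom.add() marks ids attempted in
    # this run, so each id is crawled at most once per process (bloom-filter
    # false positives can skip an occasional id).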
    # dumpBloomToFile(donedegest)
    # start(5837744, shuqCategory2)
    # start(115468, shuqCategory2)
    # shuqiAddInitTmp()
    # startFromCId()
    # shuqiAddInit()
    # miss = open('missBookId.txt', 'r')
    # while 1:
    #     line = miss.readline()
    #     if not line:
    #         break
    #     lineArr = line.split(',')
    #     bookId = lineArr[0]
    #     csor2.execute('select rawUrl from cn_dushu_book where id = %s', (bookId,))
    #     conn2.commit()
    #     rawUrl = csor2.fetchone()[0]
    #     findex = rawUrl.find('bookId=') + 7
    #     if len(rawUrl) - findex > 7:
    #         print bookId
    #         continue
    #     shuqiId = rawUrl[findex:]
    #     start(shuqiId, shuqCategory2)
    # f = open('shuqiBookId.log', 'r')
    # f.readline()
    # while 1:
    #     id = f.readline()
    #     if not id:
    #         break
    #     id = id.replace('\n', '')
    #     start(id, shuqCategory2)
    # from multiprocessing import Pool
    #
    # manager = multiprocessing.Manager()
    #
    # # The parent process creates the Queue and passes it to each child process:
    # queue = manager.Queue(maxsize=100)
    #
    # p = Pool(5)
    #
    # p.apply_async(onlyInsertCap, args=(queue,))
    # # p.apply_async(onlyInsertCap, args=(queue,))
    # startFromCId(p, queue)
    # p.close()
    # p.join()
    # ids = '6692553,4871569,5067938,57392,51602'
    # for bookId in ids.split(','):
    #     start(bookId, shuqCategory2)
    # startFromLatestAjax()


if __name__ == '__main__':
    # Register each crawler class under the name used to select it at runtime
    crawlManager.crawlers['ershoufang'] = Ershoufang
    crawlManager.crawlers['shuqiById'] = ShuqiCrawler
    # crawlManager.crawlers['shuqiFilder'] = ShuqiFilder
    crawlManager.crawlers['mianFeiTXT'] = MianFeiTXTCrawler
    crawlManager.crawlers['mianFeiTXTUpdater'] = MianFeiTXTUpdater
    crawlManager.crawlers['shuqiUpdater'] = ShuqiUpdater
    crawlManager.crawlers['zssqCrawler'] = ZssqCrawler
    crawlManager.crawlers['quanBenCrawler'] = QuanBenCrawler
    crawlManager.crawlers['quanBenUpdater'] = QuanBenUpdater

    webApp = WebServer()
    webApp.run(port=10008)
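
    # The names registered above are presumably the keys a REST client uses to
    # pick a crawler; the actual routes live in rest.restServices.WebServer.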
    # manager = Manager()
    # ershoufangCrawler = Ershoufang()
    # manager.addCrawler(ershoufangCrawler)
    # manager.start()