Skip to content

Commit 1bf90f4

Browse files
committed
误删
1 parent b247785 commit 1bf90f4

13 files changed

+1459
-57
lines changed

.gitignore

+57-57
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,57 @@
1-
# Byte-compiled / optimized / DLL files
2-
__pycache__/
3-
*.py[cod]
4-
5-
# C extensions
6-
*.so
7-
8-
# Distribution / packaging
9-
.Python
10-
env/
11-
build/
12-
develop-eggs/
13-
dist/
14-
downloads/
15-
eggs/
16-
.eggs/
17-
lib/
18-
lib64/
19-
parts/
20-
sdist/
21-
var/
22-
*.egg-info/
23-
.installed.cfg
24-
*.egg
25-
26-
# PyInstaller
27-
# Usually these files are written by a python script from a template
28-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
29-
*.manifest
30-
*.spec
31-
32-
# Installer logs
33-
pip-log.txt
34-
pip-delete-this-directory.txt
35-
36-
# Unit test / coverage reports
37-
htmlcov/
38-
.tox/
39-
.coverage
40-
.coverage.*
41-
.cache
42-
nosetests.xml
43-
coverage.xml
44-
*,cover
45-
46-
# Translations
47-
*.mo
48-
*.pot
49-
50-
# Django stuff:
51-
*.log
52-
53-
# Sphinx documentation
54-
docs/_build/
55-
56-
# PyBuilder
57-
target/
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
5+
# C extensions
6+
*.so
7+
8+
# Distribution / packaging
9+
.Python
10+
env/
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
*.egg-info/
23+
.installed.cfg
24+
*.egg
25+
26+
# PyInstaller
27+
# Usually these files are written by a python script from a template
28+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
29+
*.manifest
30+
*.spec
31+
32+
# Installer logs
33+
pip-log.txt
34+
pip-delete-this-directory.txt
35+
36+
# Unit test / coverage reports
37+
htmlcov/
38+
.tox/
39+
.coverage
40+
.coverage.*
41+
.cache
42+
nosetests.xml
43+
coverage.xml
44+
*,cover
45+
46+
# Translations
47+
*.mo
48+
*.pot
49+
50+
# Django stuff:
51+
*.log
52+
53+
# Sphinx documentation
54+
docs/_build/
55+
56+
# PyBuilder
57+
target/

src/BaiduImgDown.py

+242
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# @Author: loveNight
4+
# @Date: 2015-10-28 19:59:24
5+
# @Last Modified by: loveNight
6+
# @Last Modified time: 2015-11-15 19:43:20
7+
8+
9+
import urllib
10+
import requests
11+
import os
12+
import re
13+
import sys
14+
import time
15+
import threading
16+
from datetime import datetime as dt
17+
from multiprocessing.dummy import Pool
18+
from multiprocessing import Queue
19+
20+
21+
class BaiduImgDownloader(object):

    """Download the images Baidu image search returns for a single keyword.

    The work is done on a thread pool in two phases: first every JSON result
    page is fetched and the obfuscated image URLs in it are decoded, then
    every image is downloaded into ``dirpath`` as ``<index>.<type>``.
    Progress goes to the console and to ``logInfo.txt``; failed URLs are
    appended to ``errorUrl.txt``. Both files, and ``jsonUrl.txt``, live in
    ``sys.path[0]``.
    """

    # Multi-character tokens that stand for URL punctuation in an encoded URL.
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }

    # Single-character substitution applied after str_table has been expanded.
    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }

    # Ready-made str.translate table, built once at class-creation time so
    # decode() does not depend on __init__ having run.
    _translation = str.maketrans(char_table)

    re_objURL = re.compile(r'"objURL":"(.*?)".*?"type":"(.*?)"')
    re_downNum = re.compile(r"已下载\s(\d+)\s张图片")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
    }

    def __init__(self, word, dirpath=None, processNum=30):
        """Prepare output paths, the worker pool and the HTTP session.

        Args:
            word: Search keyword; must not contain a space.
            dirpath: Directory that receives the images. Defaults to a
                ``results`` folder inside ``sys.path[0]``.
            processNum: Number of worker threads in the pool.

        Raises:
            AttributeError: If ``word`` contains a space.
        """
        if " " in word:
            raise AttributeError("本脚本仅支持单个关键字")
        self.word = word
        # Kept for backward compatibility: ordinal -> ordinal form of the
        # class-level char_table.
        self.char_table = {ord(key): ord(value)
                           for key, value in BaiduImgDownloader.char_table.items()}
        if not dirpath:
            dirpath = os.path.join(sys.path[0], 'results')
        self.dirpath = dirpath
        self.jsonUrlFile = os.path.join(sys.path[0], 'jsonUrl.txt')
        self.logFile = os.path.join(sys.path[0], 'logInfo.txt')
        self.errorFile = os.path.join(sys.path[0], 'errorUrl.txt')
        # Start every run with a fresh error file.
        if os.path.exists(self.errorFile):
            os.remove(self.errorFile)
        # makedirs also copes with a nested dirpath and an existing directory.
        os.makedirs(self.dirpath, exist_ok=True)
        # Honour the caller's pool size instead of a hard-coded 30.
        self.pool = Pool(processNum)
        self.session = requests.Session()
        # Copy so that later changes to the session never leak into the
        # class-level dict shared by all instances.
        self.session.headers = dict(BaiduImgDownloader.headers)
        self.queue = Queue()
        self.messageQueue = Queue()
        self.index = 0  # first image number; used for counting, do not change
        self.promptNum = 10  # print a progress line every N downloaded images
        self.lock = threading.Lock()
        self.delay = 1.5  # pause before each request; too many requests get blocked
        self.QUIT = "QUIT"  # sentinel telling the log thread to stop
        self.printPrefix = "**"  # messages carrying this prefix are always printed

    def start(self):
        """Run the whole job: build page URLs, resolve image URLs, download."""
        # Console/log output runs on its own thread.
        log_thread = threading.Thread(target=self.__log)
        log_thread.daemon = True
        log_thread.start()
        self.messageQueue.put(self.printPrefix + "脚本开始执行")
        start_time = dt.now()
        urls = self.__buildUrls()
        self.messageQueue.put(self.printPrefix + "已获取 %s 个Json请求网址" % len(urls))
        # Blocks until every result page has been processed.
        self.pool.map(self.__resolveImgUrl, urls)
        # __resolveImgUrl puts exactly one list per URL, so drain by count
        # rather than by Queue.qsize(), which is not implemented on macOS.
        for _ in urls:
            imgs = self.queue.get()
            self.pool.map_async(self.__downImg, imgs)
        self.pool.close()
        self.pool.join()
        self.messageQueue.put(self.printPrefix + "下载完成!已下载 %s 张图片,总用时 %s" %
                              (self.index, dt.now() - start_time))
        self.messageQueue.put(self.printPrefix + "请到 %s 查看结果!" % self.dirpath)
        self.messageQueue.put(self.printPrefix + "错误信息保存在 %s" % self.errorFile)
        self.messageQueue.put(self.QUIT)
        # Wait for the log thread to flush the final messages; as a daemon it
        # would otherwise be killed when the interpreter exits.
        log_thread.join()

    def __log(self):
        """Write queued messages to the log file and selectively to the console.

        A single consumer thread does all the output so that lines coming
        from many workers are not interleaved. Runs until the QUIT sentinel
        is received.
        """
        with open(self.logFile, "w", encoding="utf-8") as f:
            while True:
                message = self.messageQueue.get()
                if message == self.QUIT:
                    break
                message = str(dt.now()) + " " + message
                if self.printPrefix in message:
                    print(message)
                elif "已下载" in message:
                    # Only every promptNum-th download is echoed to the console.
                    downNum = self.re_downNum.findall(message)
                    if downNum and int(downNum[0]) % self.promptNum == 0:
                        print(message)
                f.write(message + '\n')
                f.flush()

    def __getIndex(self):
        """Return the next free image number and advance the counter atomically."""
        with self.lock:
            current = self.index
            self.index += 1
            return current

    def decode(self, url):
        """Decode an obfuscated image URL.

        Encoded:
            ippr_z2C$qAzdH3FAzdH3Ffl_z&e3Bftgwt42_z&e3BvgAzdH3F4omlaAzdH3Faa8W3ZyEpymRmx3Y1p7bb&mla
        Decoded:
            http://s9.sinaimg.cn/mw690/001WjZyEty6R6xjYdtu88&690

        Args:
            url: The encoded URL string.

        Returns:
            The decoded URL string.
        """
        # Expand the multi-character tokens first ...
        for key, value in self.str_table.items():
            url = url.replace(key, value)
        # ... then map the remaining characters one by one.
        return url.translate(self._translation)

    def __buildUrls(self):
        """Build the list of JSON result-page URLs for the keyword.

        Fetches the first page to read ``displayNum``, then creates one URL
        per block of 60 results and writes them all to ``jsonUrlFile``.

        Returns:
            A list of URL strings.
        """
        # Local import: a bare ``import urllib`` does not guarantee that the
        # ``urllib.parse`` submodule is loaded.
        from urllib.parse import quote

        word = quote(self.word)
        # "istype=2&nc=1": the original template was missing the "&" between
        # these two query parameters.
        template = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2&nc=1&pn={pn}&rn=60"
        time.sleep(self.delay)
        html = self.session.get(template.format(word=word, pn=0), timeout=15).content.decode('utf-8')
        results = re.findall(r'"displayNum":(\d+),', html)
        maxNum = int(results[0]) if results else 0
        urls = [template.format(word=word, pn=x)
                for x in range(0, maxNum + 1, 60)]
        with open(self.jsonUrlFile, "w", encoding="utf-8") as f:
            for page_url in urls:
                f.write(page_url + "\n")
        return urls

    def __resolveImgUrl(self, url):
        """Fetch one JSON result page and queue the images found in it.

        Always puts exactly one list (possibly empty) on ``self.queue`` so
        that start() can drain the queue by count. A failure is logged and
        recorded in the error file instead of aborting the whole run.
        """
        imgs = []
        try:
            time.sleep(self.delay)
            html = self.session.get(url, timeout=15).content.decode('utf-8')
            datas = self.re_objURL.findall(html)
            imgs = [Image(self.decode(x[0]), x[1]) for x in datas]
            self.messageQueue.put(self.printPrefix + "已解析出 %s 个图片网址" % len(imgs))
        except Exception as e:
            message = "\n抛出异常: %s\n%s" % (url, str(e))
            self.messageQueue.put(message)
            self.__saveError(message)
        finally:
            self.queue.put(imgs)

    def __downImg(self, img):
        """Download a single image described by an Image object.

        HTTP error statuses, HTML responses and request exceptions are
        logged and appended to the error file; nothing is saved for them.
        """
        imgUrl = img.url
        # Bound before the try so the check below is always safe.
        message = None
        res = None
        try:
            time.sleep(self.delay)
            res = self.session.get(imgUrl, timeout=15)
            if res.status_code >= 400:
                # Both client (4xx) and server (5xx) errors carry no image.
                message = "\n%s: %s" % (res.status_code, imgUrl)
            elif "text/html" in res.headers.get("Content-Type", ""):
                message = "\n无法打开图片: %s" % imgUrl
        except Exception as e:
            message = "\n抛出异常: %s\n%s" % (imgUrl, str(e))
        if message:
            self.messageQueue.put(message)
            self.__saveError(message)
            return
        index = self.__getIndex()
        # index is zero-based, the progress message counts from one.
        self.messageQueue.put("已下载 %s 张图片:%s" % (index + 1, imgUrl))
        filename = os.path.join(self.dirpath, str(index) + "." + img.type)
        with open(filename, "wb") as f:
            f.write(res.content)

    def __saveError(self, message):
        """Append ``message`` to the error file, serialised by ``self.lock``."""
        with self.lock:
            with open(self.errorFile, "a", encoding="utf-8") as f:
                f.write(message)
224+
225+
226+
class Image(object):

    """Simple record for one image: its URL and its type string."""

    def __init__(self, url, type):
        """Store ``url`` and ``type`` unchanged on the instance."""
        super().__init__()
        self.url, self.type = url, type
234+
235+
236+
if __name__ == '__main__':
    # Show the banner, then ask for the keyword and run the downloader.
    for banner_line in (
        "欢迎使用百度图片下载脚本!\n目前仅支持单个关键词。",
        "下载结果保存在脚本目录下的results文件夹中。",
        "=" * 50,
    ):
        print(banner_line)
    word = input("请输入你要下载的图片关键词:\n")
    down = BaiduImgDownloader(word)
    down.start()

src/chromeHeaders2Python.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# @Author: LostInNight
4+
# @Date: 2015-11-03 20:02:27
5+
# @Last Modified by: LostInNight
6+
# @Last Modified time: 2015-11-03 20:07:56
7+
8+
9+
# Convert request headers copied from Chrome into a dict usable from Python.
# The dict is printed with plain print(), i.e. not pretty-formatted.


def headers_to_dict(raw):
    """Parse ``Name:value`` lines into a ``{name: value}`` dict.

    Each line is split on its FIRST colon only, so values that themselves
    contain colons (URLs, timestamps, ...) are kept whole. Surrounding
    whitespace is stripped from names and values. Lines without a colon or
    with an empty name are skipped.

    Args:
        raw: Header block as text, one header per line.

    Returns:
        A dict mapping header names to header values.
    """
    parsed = {}
    for line in raw.strip().split('\n'):
        name, colon, value = line.partition(':')
        name = name.strip()
        if not colon or not name:
            continue
        parsed[name] = value.strip()
    return parsed


# NOTE(review): this sample contains a captured session cookie; such values
# should normally not be committed to a repository.
s = '''Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8
Connection:keep-alive
Cookie:x-wl-uid=16g0A+YFYxJfKEtYdu3jnjT5fyfFfYsVwN90XyuIvu1ooEwLD0rk+ETnyp1GE4/LFXUyqbPPCNSE=; session-token=dL8l1xW56RLj3F3P2p6szy6b7MH7R3o1EosxprqS0ci8JaUN9kJ212ybOIEXgV/Iif/rN2LFToQlyJyV/Q69NLnbRfvI4qArBPLDRJoXb+tjUsrF+z7yXznFtEFiQ52O2lc2k1tOTsKWX4u1h7N5W/O1Y2jFF0XMB2Somg9zeSPrTPTJyVBoWl3M+M5lW7S3vX1BueyzUJKPorQ+z3lXsQ98os2NrZwhLpndF14RkKNbuRPkKluj+Q==; session-id-time=2082729601l; session-id=476-5989535-3128668; csm-hit=0GASN13S1NXJ6XZDVCSF+sa-1K96W079F5G590GEQAK7-1FZS7C1YHF9SGAH3Q9W4|1446303517108; ubid-acbcn=475-2664832-5552339
Host:www.amazon.cn
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36
'''
s = headers_to_dict(s)
print(s)

0 commit comments

Comments
 (0)