Skip to content

Commit 3c1d8b0

Browse files
committed
整理自己丢失的脚本
1 parent 9cd97ef commit 3c1d8b0

4 files changed

+684
-0
lines changed

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## 个人
44
- [etyma.py](src/etyma.py):查词根
5+
- [下载地址转换.py](src/下载地址转换.py):转换迅雷、QQ旋风、真实下载地址
6+
- [百度搜索图片下载脚本.py](src/百度搜索图片下载脚本.py):多线程下载百度图片搜索的结果
7+
- [下载人人相册.py](src/下载人人相册.py):多线程下载人人相册
58
- [zhihuClient.py](src/zhihuClient.py):知乎登录脚本
69
- [hexo向百度提交网址.py](src/hexo向百度提交网址.py):hexo博客专用,向百度提交网址
710
- [文件拖曳传七牛.py](src/文件拖曳传七牛.py):把文件拖到此脚本图标,自动上传到七牛云存储

src/下载人人相册.py

+351
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# @Author: LostInNight
4+
# @Date: 2015-11-06 11:21:41
5+
# @Last Modified by: LostInNight
6+
# @Last Modified time: 2015-11-19 15:47:49
7+
8+
"""
9+
下载人人网指定用户的所有相册
10+
需要手动获取uid
11+
12+
uid说明
13+
用户主页为:http://www.renren.com/123456789/profile
14+
网页中的123456789即为该用户的uid
15+
16+
目前已经可以完成下载,加了最简陋的多线程
17+
18+
有空时再改进:
19+
1.添加线程池
20+
2.分离抓取与解析
21+
"""
22+
23+
__author__ = "LostInNight"
24+
import requests
25+
from bs4 import BeautifulSoup as BS
26+
from datetime import datetime
27+
import pdb
28+
import sys
29+
import os
30+
from multiprocessing import Queue
31+
import threading
32+
import re
33+
from collections import Counter
34+
import time
35+
36+
# Module-level shared state
re_filename = re.compile(r'[\/:*?"<>]')
re_uid_in_albums_url = re.compile(r'id=(\d+)&')
re_max_page = re.compile(r'/(\d+)页')

home_url = r"http://3g.renren.com/"
login_url = r"http://3g.renren.com/login.do?autoLogin=true&"
# User profile page; format arg: uid
user_url_pattern = r"http://3g.renren.com/profile.do?id={0}"
# Album list; format args: 0-based page number, uid
albums_url_pattern = r"http://3g.renren.com/album/wmyalbum.do?curpage={0}&id={1}"
# Photo list; format args: 0-based page number, album id, uid
photos_url_pattern = r"http://3g.renren.com/album/wgetalbum.do?curpage={0}&id={1}&owner={2}"

albums_queue = Queue()  # each element is an Album object
photos_queue = Queue()  # each element is a Photo object

s = requests.Session()
debug = True
delay = 3  # seconds between requests, to avoid being throttled by the site
lock = threading.Lock()
57+
58+
59+
def log(message):
    """Print *message* with a timestamp when the module flag ``debug`` is on.

    Thread-safe: output is serialized through the module-level ``lock``.
    String messages are round-tripped through GBK (dropping unencodable
    characters) so printing on a GBK console cannot raise
    UnicodeEncodeError.
    """
    if not debug:
        return
    with lock:  # serialize output from worker threads
        now = str(datetime.now())
        index = now.rfind(":")
        now = now[:index + 3]  # trim to hundredths of a second
        print("\n%s" % now)
        if isinstance(message, str):
            # BUG FIX: the original discarded the result of this
            # expression, so the GBK sanitization had no effect.
            message = message.encode("gbk", errors="ignore").decode("gbk")
        print(message)
72+
73+
74+
def main(username, password, uid, filepath):
    """Script entry point: download every album of a renren.com user.

    username -- login account name
    password -- login password
    uid      -- uid of the user whose albums will be downloaded
    filepath -- local directory to save the albums into

    About uid:
    if the user's profile page is http://www.renren.com/123456789/profile,
    then 123456789 is that user's uid.
    """
    start_time = time.time()  # start timing the whole run
    login(username, password)
    target_user_name = get_target_user_name(uid)  # display name, used as folder name
    filepath = os.path.join(filepath, target_user_name)
    if not os.path.isdir(filepath):
        os.mkdir(filepath)
    resolve_albums_queue(uid)  # fill albums_queue with Album objects

    # One thread per album: resolve every photo into photos_queue
    threads = []
    while not albums_queue.empty():
        album = albums_queue.get()
        t = threading.Thread(target=resolve_photos_queue, args=(album,))
        t.start()
        threads.append(t)
    for x in threads:
        x.join()

    log("一共 %s 张照片" % photos_queue.qsize())
    log("开始下载")

    # One thread per photo: download everything that was queued
    threads.clear()
    while not photos_queue.empty():
        photo = photos_queue.get()
        t = threading.Thread(target=down_photo, args=(photo, filepath))
        t.start()
        threads.append(t)
    for x in threads:
        x.join()

    used_time = trans_time(time.time() - start_time)
    log("下载完成,耗时:%s,请查看 %s" % (used_time, filepath))
116+
117+
118+
def get(url, binary=False):
    """Issue a GET request; return bytes when *binary* is true, text otherwise."""
    time.sleep(delay)  # throttle every request
    response = s.get(url)
    return response.content if binary else response.text
125+
126+
127+
def login(username, password):
    """Log in to the mobile renren site (3g.renren.com) on the shared session."""
    # Load the home page to pick up the token the login form requires
    html = get(home_url)
    soup = BS(html, "lxml")
    lbskey = soup.find("input", {"name": "lbskey"})["value"]
    log("登录用的lbskey:%s" % lbskey)
    post_data = {
        "origURL": "",
        "lbskey": lbskey,
        "c": "",
        "pq": "",
        "appid": "",
        "ref": "http://m.renren.com/q.do?null",
        "email": username,
        "password": password,
        "login": "登录"
    }
    headers = {
        "Host": "3g.renren.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36"
    }
    html = s.post(login_url, data=post_data, headers=headers)
    # On failure we stay at:  http://3g.renren.com/login.do?autoLogin=true&
    # On success we land on:  http://3g.renren.com/home.do?sid........
    if "请输入密码和验证码后登录" in html.text:
        # Captcha required: save it locally and ask the user to type it in
        verify_file = os.path.join(sys.path[0], "verify.jpg")
        soup = BS(html.text, "lxml")
        verifykey = soup.find(
            "input", {"type": "hidden", "name": "verifykey"})["value"]
        img_src = soup.find("img", {"alt": "此处为验证码"})["src"]
        while True:
            html = get(img_src, True)
            # Sometimes the captcha comes back as a text page instead of an
            # image; refresh until we get real image data
            if not "javascript:history" in str(html):
                with open(verify_file, "wb") as f:
                    f.write(html)
                break
        print("请打开 %s ,识别验证码!" % verify_file)  # must always be visible, so plain print
        verifycode = input("请输入验证码:")
        print("继续执行")
        post_data["verifykey"] = verifykey
        post_data["verifycode"] = verifycode
        html = s.post(login_url, data=post_data)
    assert html.url.startswith(r"http://3g.renren.com/home"), "登录失败!请检查账号!"
    log("登录成功!")
173+
174+
175+
def get_target_user_name(uid):
    """Fetch and return the display name of the user identified by *uid*."""
    page = BS(get(user_url_pattern.format(uid)), "lxml")
    container = page.find(name="div", class_="ssec")
    user_name = str(container.find_next("b").string)
    log("待抓取的用户的姓名为:%s" % user_name)
    return user_name
183+
184+
185+
def resolve_albums_queue(uid):
    """Parse the album list of user *uid*.

    About uid: if the profile page is
    http://www.renren.com/123456789/profile, then 123456789 is the uid.

    Every album found is wrapped in an Album object and pushed onto the
    shared albums_queue.
    """
    # Locate the album-list page from the user's profile
    profile = BS(get(user_url_pattern.format(uid)), "lxml")
    albums_url = profile.find(name="a", text="相册")["href"]
    uid = re_uid_in_albums_url.findall(albums_url)[0]
    max_page = get_max_page(albums_url)
    # Walk every page of the album list
    for page_no in range(max_page):
        resolve_albums_page(albums_url_pattern.format(page_no, uid))
    log("一共 %s 个相册" % albums_queue.qsize())
206+
207+
208+
def get_max_page(url):
    """Return the total number of pages of the paginated listing at *url*.

    The page shows a label such as "(第3/5页)"; the total (5 here) sits in a
    gray span next to the "末页" (last page) link and is extracted with
    ``re_max_page``.

    BUG FIX: the original returned 0 when no "末页" link exists (i.e. the
    listing has a single page), which made callers iterate ``range(0)`` and
    skip that page entirely — single-page albums were never downloaded.  A
    single page counts as 1.
    """
    soup = BS(get(url), "lxml")
    last_page_link = soup.find(name="a", title="末页")
    if not last_page_link:
        return 1  # no pagination link: exactly one page
    label = str(last_page_link.find_next(name="span", class_="gray").string)
    return int(re_max_page.findall(label)[0])
223+
224+
225+
def resolve_albums_page(url):
    """Parse one page of the album list and enqueue an Album per entry."""
    page = BS(get(url), "lxml")
    for anchor in page.find_all(name="a", class_="p"):
        album_url = anchor["href"]
        title_link = anchor.find_next(name="a", href=album_url)
        album_name = str(title_link.string)
        updated = str(title_link.find_next(name="span", class_="ns").string)
        log("相册名:%s\n%s" % (album_name, updated))
        albums_queue.put(Album(album_name, album_url))
238+
239+
240+
def resolve_photos_queue(album):
    """Resolve every photo page of *album*.

    Each photo found is wrapped in a Photo object and pushed onto the
    shared photos_queue.
    """
    # album.url is the first page of the album's photo listing
    page_count = get_max_page(album.url)
    for page_no in range(page_count):
        listing_url = photos_url_pattern.format(page_no, album.id, album.uid)
        resolve_photos_page(listing_url, album.name)
    log("线程 %s 已解析相册:%s" % (threading.current_thread().name, album.name))
250+
251+
252+
def resolve_photos_page(url, album_name):
    """Parse one page of a photo listing and enqueue a Photo per entry."""
    listing = BS(get(url), "lxml").find(name="table", class_="p")
    for link in listing.find_all(name="a", href=re.compile(r"^http://")):
        page_url = link["href"]
        photos_queue.put(Photo(album_name, get_photo_url(page_url), page_url))
261+
262+
263+
def get_photo_url(photo_page_url):
    """Return the direct download URL of the photo shown on *photo_page_url*."""
    page = BS(get(photo_page_url), "lxml")
    return page.find(name="a", text="下载")["href"]
269+
270+
271+
def down_photo(photo, filepath):
    """Download one Photo into *filepath*/<album name>/.

    Runs concurrently on many worker threads.
    """
    # Strip characters that are illegal in file names
    album_name = adjust_filename(photo.album_name)
    photo_name = adjust_filename(photo.name)
    # Directory for this album.  makedirs(exist_ok=True) is immune to the
    # race where several threads create the same album directory at once
    # (the original isdir-then-mkdir check could raise FileExistsError).
    filepath = os.path.join(filepath, album_name)
    os.makedirs(filepath, exist_ok=True)
    # Fetch first, then open the file: a failed download no longer leaves
    # behind an empty/truncated file on disk.
    data = get(photo.url, True)
    file = os.path.join(filepath, photo_name)
    with open(file, "wb") as f:
        f.write(data)
    log("已下载 %s\n%s" % (file, photo.url))
286+
287+
288+
def adjust_filename(filename):
    """Drop characters that may not appear in a (Windows) file name."""
    cleaned = re_filename.sub("", filename)
    return cleaned
291+
292+
# seconds --> hours / minutes / seconds
def trans_time(sec):
    """Format a duration in seconds as an "X小时 Y分 Z秒" string.

    Seconds are truncated to a whole number, so float input (e.g. from
    ``time.time()`` arithmetic) no longer produces an unreadable tail like
    "5.234999999999秒" as the original did.
    """
    minute, sec = divmod(int(sec), 60)
    hour, minute = divmod(minute, 60)
    return "%s小时 %s分 %s秒" % (hour, minute, sec)
301+
302+
303+
class Album(object):

    """One photo album of a renren user, parsed from its listing URL."""

    count = 0  # number of Album instances created so far
    re_uid = re.compile(r'owner=(\d+)&')
    re_album_id = re.compile(r'id=(\d+)&')

    def __init__(self, name, url):
        super(Album, self).__init__()
        Album.count += 1
        self.name = name
        self.uid = Album.re_uid.findall(url)[0]
        self.id = Album.re_album_id.findall(url)[0]
        # Keep only the album's first page: trim everything after the
        # owner uid to drop useless query parameters.
        cutoff = url.find(self.uid) + len(self.uid)
        self.url = url[:cutoff]
319+
320+
321+
class Photo(object):

    """One photo inside an album, parsed from its page URL."""

    count = Counter()  # photos seen so far, keyed by album name
    re_uid = re.compile(r'owner=(\d+)&')
    re_album_id = re.compile(r'albumid=(\d+)&')
    re_photo_id = re.compile(r'id=(\d+)&albumid')

    def __init__(self, album_name, photo_url, photo_page_url):
        super(Photo, self).__init__()
        self.album_name = album_name
        Photo.count[self.album_name] += 1
        self.url = photo_url  # direct image URL
        # File name: running number within the album plus original suffix
        suffix = self.url[self.url.rfind("."):]
        self.name = str(Photo.count[self.album_name]) + suffix
        self.uid = Photo.re_uid.findall(photo_page_url)[0]
        self.album_id = Photo.re_album_id.findall(photo_page_url)[0]
        self.id = Photo.re_photo_id.findall(photo_page_url)[0]
        # Trim everything after the owner uid to drop useless parameters
        cutoff = photo_page_url.find(self.uid) + len(self.uid)
        self.page_url = photo_page_url[:cutoff]
343+
344+
345+
if __name__ == '__main__':
    # Fill in your own credentials and the target uid before running.
    username = "xxx"
    password = "xxx"
    uid = "xxx"
    filepath = "F:\\"

    main(username, password, uid, filepath)

0 commit comments

Comments
 (0)