|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +# @Author: LostInNight |
| 4 | +# @Date: 2015-11-06 11:21:41 |
| 5 | +# @Last Modified by: LostInNight |
| 6 | +# @Last Modified time: 2015-11-19 15:47:49 |
| 7 | + |
| 8 | +""" |
| 9 | +下载人人网指定用户的所有相册 |
| 10 | +需要手动获取uid |
| 11 | +
|
| 12 | +uid说明 |
| 13 | + 用户主页为:http://www.renren.com/123456789/profile |
| 14 | + 网页中的123456789即为该用户的uid |
| 15 | +
|
| 16 | +目前已经可以完成下载,加了最简陋的多线程 |
| 17 | +
|
| 18 | +有空时再改进: |
| 19 | +1.添加线程池 |
| 20 | +2.分离抓取与解析 |
| 21 | +""" |
| 22 | + |
| 23 | +__author__ = "LostInNight" |
| 24 | +import requests |
| 25 | +from bs4 import BeautifulSoup as BS |
| 26 | +from datetime import datetime |
| 27 | +import pdb |
| 28 | +import sys |
| 29 | +import os |
| 30 | +from multiprocessing import Queue |
| 31 | +import threading |
| 32 | +import re |
| 33 | +from collections import Counter |
| 34 | +import time |
| 35 | + |
# Shared module-level state
re_filename = re.compile(r'[\/:*?"<>]')          # characters not allowed in Windows filenames
re_uid_in_albums_url = re.compile(r'id=(\d+)&')  # extracts the uid from the album-list URL
re_max_page = re.compile(r'/(\d+)页')            # extracts the total from a "第x/y页" pager

home_url = r"http://3g.renren.com/"
login_url = r"http://3g.renren.com/login.do?autoLogin=true&"
# User profile page; format arg: uid
user_url_pattern = r"http://3g.renren.com/profile.do?id={0}"
# Album list; format args: 0-based page number, uid
albums_url_pattern = r"http://3g.renren.com/album/wmyalbum.do?curpage={0}&id={1}"
# Photo list; format args: 0-based page number, album id, uid
photos_url_pattern = r"http://3g.renren.com/album/wgetalbum.do?curpage={0}&id={1}&owner={2}"

albums_queue = Queue()  # each element is an Album object
photos_queue = Queue()  # each element is a Photo object

s = requests.Session()
debug = True
delay = 3  # seconds to wait before each request, to avoid being throttled
lock = threading.Lock()
| 57 | + |
| 58 | + |
def log(message):
    """Print *message* preceded by a timestamp, thread-safely, when debug is on.

    String messages are round-tripped through gbk with errors="ignore" so that
    characters a gbk console cannot display are dropped instead of raising
    UnicodeEncodeError from print().
    """
    if not debug:
        return
    with lock:  # keep timestamp and message lines together across threads
        now = str(datetime.now())
        index = now.rfind(":")
        now = now[:index + 3]  # trim sub-second precision to two digits
        print("\n%s" % now)
        if isinstance(message, str):
            # BUG FIX: the sanitized string used to be discarded, making the
            # gbk round-trip a no-op; assign it back before printing.
            message = message.encode("gbk", errors="ignore").decode("gbk")
        print(message)
| 72 | + |
| 73 | + |
def main(username, password, uid, filepath):
    """Script entry point.

    username -- renren login name
    password -- login password
    uid      -- uid of the user whose albums will be downloaded
    filepath -- directory under which a folder named after the user is created

    The uid is the number in the user's homepage URL:
    http://www.renren.com/123456789/profile -> uid is 123456789.
    """
    started = time.time()  # start the wall-clock timer
    login(username, password)
    # The target user's display name doubles as the download folder name.
    owner_name = get_target_user_name(uid)
    filepath = os.path.join(filepath, owner_name)
    if not os.path.isdir(filepath):
        os.mkdir(filepath)
    resolve_albums_queue(uid)  # fill albums_queue with Album objects

    # One worker thread per album resolves its photo pages.
    workers = []
    while not albums_queue.empty():
        worker = threading.Thread(target=resolve_photos_queue,
                                  args=(albums_queue.get(),))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    log("一共 %s 张照片" % photos_queue.qsize())
    log("开始下载")

    # One worker thread per photo performs the actual download.
    workers = []
    while not photos_queue.empty():
        worker = threading.Thread(target=down_photo,
                                  args=(photos_queue.get(), filepath))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    used_time = trans_time(time.time() - started)
    log("下载完成,耗时:%s,请查看 %s" % (used_time, filepath))
| 116 | + |
| 117 | + |
def get(url, binary=False):
    """GET *url* through the shared session after the global request delay.

    Returns the raw response bytes when *binary* is true, decoded text otherwise.
    """
    time.sleep(delay)
    response = s.get(url)
    return response.content if binary else response.text
| 125 | + |
| 126 | + |
def login(username, password):
    """Log in to the mobile (3g) renren site using the shared session.

    Fetches the home page to obtain the lbskey token, posts the credentials,
    and — if the site demands it — saves a captcha image to verify.jpg next
    to the script and asks the user to type it in.  Raises AssertionError
    when the final URL does not indicate a successful login.
    """
    # Open the home page to collect the data the login post needs.
    html = get(home_url)
    soup = BS(html, "lxml")
    lbskey = soup.find("input", {"name": "lbskey"})["value"]
    log("登录用的lbskey:%s" % lbskey)
    post_data = {
        "origURL": "",
        "lbskey": lbskey,
        "c": "",
        "pq": "",
        "appid": "",
        "ref": "http://m.renren.com/q.do?null",
        "email": username,
        "password": password,
        "login": "登录"
    }
    headers = {
        "Host": "3g.renren.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36"
    }
    html = s.post(login_url, data=post_data, headers=headers)
    # On failure the URL stays:    http://3g.renren.com/login.do?autoLogin=true&
    # On success it redirects to:  http://3g.renren.com/home.do?sid........
    if "请输入密码和验证码后登录" in html.text:
        # A captcha is required: download it and ask the user to solve it.
        verify_file = os.path.join(sys.path[0], "verify.jpg")
        soup = BS(html.text, "lxml")
        verifykey = soup.find(
            "input", {"type": "hidden", "name": "verifykey"})["value"]
        img_src = soup.find("img", {"alt": "此处为验证码"})["src"]
        while True:
            html = get(img_src, True)
            # Sometimes the image request returns an error page with text
            # instead of the picture; retry until real image bytes arrive.
            if not "javascript:history" in str(html):
                with open(verify_file, "wb") as f:
                    f.write(html)
                break
        print("请打开 %s ,识别验证码!" % verify_file)  # must always show: print, not log
        verifycode = input("请输入验证码:")
        print("继续执行")
        post_data["verifykey"] = verifykey
        post_data["verifycode"] = verifycode
        html = s.post(login_url, data=post_data)
    assert html.url.startswith(r"http://3g.renren.com/home"), "登录失败!请检查账号!"
    log("登录成功!")
| 173 | + |
| 174 | + |
def get_target_user_name(uid):
    """Fetch the profile page of *uid* and return the user's display name."""
    soup = BS(get(user_url_pattern.format(uid)), "lxml")
    name = str(soup.find(name="div", class_="ssec").find_next("b").string)
    log("待抓取的用户的姓名为:%s" % name)
    return name
| 183 | + |
| 184 | + |
def resolve_albums_queue(uid):
    """Walk the album-list pages of user *uid* and enqueue one Album each.

    The uid is the number from http://www.renren.com/123456789/profile.
    Results are stored as Album objects in the shared albums_queue.
    """
    # Find the album-list link on the profile page; its href carries the
    # canonical uid used by the mobile site.
    profile = BS(get(user_url_pattern.format(uid)), "lxml")
    albums_url = profile.find(name="a", text="相册")["href"]
    uid = re_uid_in_albums_url.findall(albums_url)[0]
    # Visit every page of the album list.
    for page in range(get_max_page(albums_url)):
        resolve_albums_page(albums_url_pattern.format(page, uid))
    log("一共 %s 个相册" % albums_queue.qsize())
| 206 | + |
| 207 | + |
def get_max_page(url):
    """Return the total number of pages of the paginated listing at *url*.

    The page shows a pager such as "(第3/5页)" next to the "末页" (last page)
    link; the number after the slash is the total page count.  A listing
    with a single page has no "末页" link at all.

    BUG FIX: this used to return 0 for a single-page listing, so callers
    iterating ``range(get_max_page(url))`` never visited the page and
    single-page albums / photo lists were silently skipped.  Return 1
    instead; multi-page results are unchanged.
    """
    soup = BS(get(url), "lxml")
    last_link = soup.find(name="a", title="末页")
    if not last_link:
        return 1  # only one page
    pager_text = str(last_link.find_next(name="span", class_="gray").string)
    return int(re_max_page.findall(pager_text)[0])
| 223 | + |
| 224 | + |
def resolve_albums_page(url):
    """Parse one album-list page and enqueue an Album for every entry."""
    soup = BS(get(url), "lxml")
    for anchor in soup.find_all(name="a", class_="p"):
        album_url = anchor["href"]
        # The title link shares the same href as the thumbnail link.
        title_tag = anchor.find_next(name="a", href=album_url)
        album_name = str(title_tag.string)
        album_update_time = str(
            title_tag.find_next(name="span", class_="ns").string)
        log("相册名:%s\n%s" % (album_name, album_update_time))
        albums_queue.put(Album(album_name, album_url))
| 238 | + |
| 239 | + |
def resolve_photos_queue(album):
    """Resolve every photo page of *album* (an Album object).

    Each photo-list page of the album is visited in turn; the resulting
    Photo objects are placed in the shared photos_queue.
    """
    # album.url is the first page of the photo list.
    for page in range(get_max_page(album.url)):
        url = photos_url_pattern.format(page, album.id, album.uid)
        resolve_photos_page(url, album.name)
    log("线程 %s 已解析相册:%s" % (threading.current_thread().name, album.name))
| 250 | + |
| 251 | + |
def resolve_photos_page(url, album_name):
    """Parse one photo-list page and enqueue a Photo for every entry."""
    soup = BS(get(url), "lxml")
    anchors = soup.find(name="table", class_="p").find_all(
        name="a", href=re.compile(r"^http://"))
    for anchor in anchors:
        page_url = anchor["href"]
        photos_queue.put(Photo(album_name, get_photo_url(page_url), page_url))
| 261 | + |
| 262 | + |
def get_photo_url(photo_page_url):
    """Return the direct image URL found behind the "下载" (download) link."""
    soup = BS(get(photo_page_url), "lxml")
    return soup.find(name="a", text="下载")["href"]
| 269 | + |
| 270 | + |
def down_photo(photo, filepath):
    """Download *photo* (a Photo object) into *filepath*/<album name>/.

    Fixes two defects of the original:
    - the isdir-then-mkdir check raced between the downloader threads spawned
      in main() and could raise FileExistsError; os.makedirs(...,
      exist_ok=True) is atomic with respect to that race;
    - the image is now fetched *before* the file is opened, so a failed
      request no longer leaves an empty file on disk.
    """
    # Strip characters that cannot appear in file names.
    album_name = adjust_filename(photo.album_name)
    photo_name = adjust_filename(photo.name)
    # Folder that holds this album's photos.
    filepath = os.path.join(filepath, album_name)
    os.makedirs(filepath, exist_ok=True)
    data = get(photo.url, True)  # fetch first; only write on success
    file = os.path.join(filepath, photo_name)
    with open(file, "wb") as f:
        f.write(data)
    log("已下载 %s\n%s" % (file, photo.url))
| 286 | + |
| 287 | + |
def adjust_filename(filename):
    """Return *filename* with characters that are illegal in file names removed."""
    return re.sub(r'[\/:*?"<>]', "", filename)
| 291 | + |
# seconds --> "hours minutes seconds" string


def trans_time(sec):
    """Format a duration of *sec* seconds as "H小时 M分 S秒"."""
    hour, sec = divmod(sec, 3600)
    minute, sec = divmod(sec, 60)
    return "%s小时 %s分 %s秒" % (int(hour), int(minute), sec)
| 301 | + |
| 302 | + |
class Album(object):

    """One renren album: its display name plus ids parsed from its URL."""

    count = 0  # number of Album instances created so far
    re_uid = re.compile(r'owner=(\d+)&')
    re_album_id = re.compile(r'id=(\d+)&')

    def __init__(self, name, url):
        super(Album, self).__init__()
        self.name = name
        Album.count += 1
        self.uid = Album.re_uid.search(url).group(1)
        self.id = Album.re_album_id.search(url).group(1)
        # Keep the URL of the album's first page only, dropping the
        # trailing query parameters after the uid.
        cut = url.find(self.uid) + len(self.uid)
        self.url = url[:cut]
| 319 | + |
| 320 | + |
class Photo(object):

    """One renren photo: its direct image URL, page URL and parsed ids."""

    count = Counter()  # running photo number, keyed by album name
    re_uid = re.compile(r'owner=(\d+)&')
    re_album_id = re.compile(r'albumid=(\d+)&')
    re_photo_id = re.compile(r'id=(\d+)&albumid')

    def __init__(self, album_name, photo_url, photo_page_url):
        super(Photo, self).__init__()
        self.album_name = album_name
        Photo.count[album_name] += 1
        self.url = photo_url  # direct image URL
        # File name is "<running number><original suffix>", e.g. "3.jpg".
        suffix = self.url[self.url.rfind("."):]
        self.name = "%s%s" % (Photo.count[album_name], suffix)
        self.uid = Photo.re_uid.search(photo_page_url).group(1)
        self.album_id = Photo.re_album_id.search(photo_page_url).group(1)
        self.id = Photo.re_photo_id.search(photo_page_url).group(1)
        # Keep the page URL only up to the end of the uid parameter,
        # dropping trailing query noise.
        cut = photo_page_url.find(self.uid) + len(self.uid)
        self.page_url = photo_page_url[:cut]
| 343 | + |
| 344 | + |
if __name__ == '__main__':
    # Fill in your renren credentials, the target user's uid and the
    # directory the albums should be saved under before running.
    username = "xxx"
    password = "xxx"
    uid = "xxx"
    filepath = "F:\\"

    main(username, password, uid, filepath)
0 commit comments