
Commit 531e24b: add files

1 parent d9bb4dd commit 531e24b

6 files changed: +531 -1 lines

README.md (+4 -1)
@@ -1,2 +1,5 @@
 # learn_python3_spider
-Python 3 spider examples: the right way to learn Python!
+Coming up next: the right way to learn Python!
+
+peace.
+

dangdang_top_500.py (new file, +49)
import requests
import re
import json


def request_dandan(url):
    # fetch the page; return None on any request error or non-200 response
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def parse_result(html):
    # raw string so \d and \s are real regex escapes; re.S lets .*? span newlines
    pattern = re.compile(
        r'<li>.*?list_num.*?(\d+)\.</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">¥(.*?)</span>.*?</li>',
        re.S)
    items = re.findall(pattern, html)

    for item in items:
        yield {
            'range': item[0],
            'image': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6]
        }


def write_item_to_file(item):
    print('Writing item ====> ' + str(item))
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')


def main(page):
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        return
    items = parse_result(html)  # parse out the fields we want
    for item in items:
        write_item_to_file(item)


if __name__ == "__main__":
    for i in range(1, 26):
        main(i)
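
A note on the fetch helper above: it sends no User-Agent and sets no timeout, so a slow or picky server can hang the run or bounce the request. A minimal hardened variant, as a sketch only (the header value, timeout, and retry count are assumptions, not part of this commit):

import requests


def request_dandan_safe(url, retries=3):
    # hypothetical variant of request_dandan: browser-like UA, timeout, simple retries
    headers = {'User-Agent': 'Mozilla/5.0'}
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            continue
    return None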

douban_top_250_books.py (new file, +67)
import requests
from bs4 import BeautifulSoup
import xlwt


def request_douban(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


book = xlwt.Workbook(encoding='utf-8', style_compression=0)

sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)
sheet.write(0, 0, 'Title')
sheet.write(0, 1, 'Image')
sheet.write(0, 2, 'Rank')
sheet.write(0, 3, 'Score')
sheet.write(0, 4, 'Author')
sheet.write(0, 5, 'Summary')

n = 1  # next row to write; row 0 holds the header


def save_to_excel(soup):
    global n

    items = soup.find(class_='grid_view').find_all('li')

    for item in items:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string  # the rank sits in an <em class=""> tag
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # some entries have no one-line quote; default to '' so the writes below never see an unbound name
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        # print('Scraping movie: ' + item_index + ' | ' + item_name + ' | ' + item_img + ' | ' + item_score + ' | ' + item_author + ' | ' + item_intr)
        print('Scraping movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)

        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)

        n = n + 1


def main(page):
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    html = request_douban(url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)


if __name__ == '__main__':

    for i in range(0, 10):
        main(i)

    # xlwt writes the legacy .xls format, so save with a matching extension
    book.save('douban_top_250_movies.xls')
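
One caveat on the Excel step: xlwt can only produce the legacy .xls format, which is why the save call above uses a .xls extension. If a real .xlsx file is wanted, openpyxl is a common replacement; a minimal sketch of the idea, not what this commit uses (names are illustrative):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = 'Douban Movie Top250'
ws.append(['Title', 'Image', 'Rank', 'Score', 'Author', 'Summary'])


def save_row(name, img, index, score, author, intr):
    # append() tracks the next free row itself, so the global counter n goes away
    ws.append([name, img, index, score, author, intr])


wb.save('douban_top_250_movies.xlsx')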

fuck_bilibili_captcha.py (new file, +204)
import time
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
from io import BytesIO

driver = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver')
WAIT = WebDriverWait(driver, 10)
url = 'https://passport.bilibili.com/login'


def merge_image(image_file, location_list):
    """
    Reassemble the scrambled captcha image.
    :param image_file: image file object
    :param location_list: slice positions
    :return: the reassembled image
    """

    # slices belonging to the upper and lower halves
    upper_half_list = []
    down_half_list = []

    image = Image.open(image_file)

    # the y offset tells us whether a slice belongs to the upper or lower half
    for location in location_list:
        if location['y'] == -58:
            # slice width 10, y: 58-116
            im = image.crop((abs(location['x']), 58, abs(location['x']) + 10, 116))
            upper_half_list.append(im)
        if location['y'] == 0:
            # slice width 10, y: 0-58
            im = image.crop((abs(location['x']), 0, abs(location['x']) + 10, 58))
            down_half_list.append(im)

    # create a blank image of the same size
    new_image = Image.new('RGB', (260, 116))

    # paste the upper half (y runs top to bottom, 0-116)
    offset = 0
    for im in upper_half_list:
        new_image.paste(im, (offset, 0))
        offset += 10

    # paste the lower half
    offset = 0
    for im in down_half_list:
        new_image.paste(im, (offset, 58))
        offset += 10

    return new_image


def get_distance(bg_Image, fullbg_Image):

    # per-pixel RGB difference threshold
    threshold = 200

    print(bg_Image.size[0])
    print(bg_Image.size[1])

    # scan left to right, starting at x = 60 to skip the puzzle piece's initial
    # position; the first column where the two images differ noticeably is the gap
    # (returns None if no such column is found)
    for i in range(60, bg_Image.size[0]):
        for j in range(bg_Image.size[1]):
            bg_pix = bg_Image.getpixel((i, j))
            fullbg_pix = fullbg_Image.getpixel((i, j))
            r = abs(bg_pix[0] - fullbg_pix[0])
            g = abs(bg_pix[1] - fullbg_pix[1])
            b = abs(bg_pix[2] - fullbg_pix[2])

            if r + g + b > threshold:
                return i


def get_path(distance):
    # build a human-like track: accelerate for 4/5 of the distance, then brake
    result = []
    current = 0
    mid = distance * 4 / 5
    t = 0.2
    v = 0
    while current < (distance - 10):
        if current < mid:
            a = 2
        else:
            a = -3
        v0 = v
        v = v0 + a * t
        s = v0 * t + 0.5 * a * t * t
        current += s
        result.append(round(s))
    return result


def start_drag(driver, distance):

    # eaten by the monster (this attempt got detected)
    # knob = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#gc-box > div > div.gt_slider > div.gt_slider_knob.gt_show")))
    # ActionChains(driver).click_and_hold(knob).perform()
    # ActionChains(driver).move_by_offset(xoffset=distance, yoffset=0.1).perform()
    # time.sleep(0.5)
    # ActionChains(driver).release(knob).perform()

    # eaten by the monster, too
    # ActionChains(driver).drag_and_drop_by_offset(knob, distance-10, 0).perform()

    knob = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#gc-box > div > div.gt_slider > div.gt_slider_knob.gt_show")))
    result = get_path(distance)
    ActionChains(driver).click_and_hold(knob).perform()

    for x in result:
        ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()

    time.sleep(0.5)
    ActionChains(driver).release(knob).perform()


def recognize_code(driver):
    """
    Crack the slider captcha.
    :param driver: selenium driver
    :return:
    """

    bs = BeautifulSoup(driver.page_source, 'lxml')
    # find the divs holding the gapped-background and full-background slices
    bg_div = bs.find_all(class_='gt_cut_bg_slice')
    fullbg_div = bs.find_all(class_='gt_cut_fullbg_slice')

    # url of the background image with the gap
    bg_url = re.findall(r'background-image:\surl\("(.*?)"\)', bg_div[0].get('style'))
    # url of the full background image
    fullbg_url = re.findall(r'background-image:\surl\("(.*?)"\)', fullbg_div[0].get('style'))

    # positions of each gapped-background slice
    bg_location_list = []
    # positions of each full-background slice
    fullbg_location_list = []

    for bg in bg_div:
        location = {}
        location['x'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', bg.get('style'))[0][0])
        location['y'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', bg.get('style'))[0][1])
        bg_location_list.append(location)

    for fullbg in fullbg_div:
        location = {}
        location['x'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', fullbg.get('style'))[0][0])
        location['y'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', fullbg.get('style'))[0][1])
        fullbg_location_list.append(location)

    print(bg_location_list)
    print(fullbg_location_list)

    # ask the server for jpg instead of webp
    bg_url = bg_url[0].replace('webp', 'jpg')
    fullbg_url = fullbg_url[0].replace('webp', 'jpg')
    # print(bg_url)
    # print(fullbg_url)

    # download both images
    bg_image = requests.get(bg_url).content
    fullbg_image = requests.get(fullbg_url).content
    print('images downloaded')

    # wrap the raw bytes as file objects
    bg_image_file = BytesIO(bg_image)
    fullbg_image_file = BytesIO(fullbg_image)

    # reassemble both images
    bg_Image = merge_image(bg_image_file, bg_location_list)
    fullbg_Image = merge_image(fullbg_image_file, fullbg_location_list)
    # bg_Image.show()
    # fullbg_Image.show()

    # measure the gap offset
    distance = get_distance(bg_Image, fullbg_Image)
    print('distance: %s' % str(distance))

    start_drag(driver, distance)


if __name__ == '__main__':

    # wait until the slider knob is ready
    driver.get(url)
    slider = WAIT.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#gc-box > div > div.gt_slider > div.gt_slider_knob.gt_show")))

    recognize_code(driver)

    # driver.close()
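
The track builder get_path is worth a standalone sanity check: the step sizes grow while accelerating, shrink while braking, and their sum lands just short of the target because the loop deliberately stops 10 px early. A self-contained sketch of the same accelerate-then-brake scheme, runnable without a browser (the 120 px input is an arbitrary example):

def get_path(distance):
    # a = 2 until 4/5 of the distance is covered, then a = -3, sampled every t = 0.2
    result, current, v, t = [], 0, 0, 0.2
    mid = distance * 4 / 5
    while current < (distance - 10):
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        s = v0 * t + 0.5 * a * t * t
        current += s
        result.append(round(s))
    return result


track = get_path(120)
print(track)       # step sizes ramp up, then taper off
print(sum(track))  # roughly 110: stops 10 px short of 120 by design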
