
Commit 531e24b: add files

1 parent d9bb4dd commit 531e24b

6 files changed: +531 -1 lines

README.md (+4 -1)
@@ -1,2 +1,5 @@
 # learn_python3_spider
-Python 3 spider examples: the right way to learn Python!
+Coming up next: the right way to learn Python!
+
+peace.
+

dangdang_top_500.py (new file, +49)
import requests
import re
import json


def request_dandan(url):
    # fetch the page; return None on any request error or non-200 response
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def parse_result(html):
    # raw string so \d and \s are real regex escapes; re.S lets .*? span newlines
    pattern = re.compile(
        r'<li>.*?list_num.*?(\d+)\.</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">¥(.*?)</span>.*?</li>',
        re.S)
    items = re.findall(pattern, html)

    for item in items:
        yield {
            'range': item[0],
            'image': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6]
        }


def write_item_to_file(item):
    print('Writing item ====> ' + str(item))
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')


def main(page):
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        return
    items = parse_result(html)  # parse out the fields we want
    for item in items:
        write_item_to_file(item)


if __name__ == "__main__":
    for i in range(1, 26):
        main(i)
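
A note on the fetch helper above: it sends no User-Agent and sets no timeout, so a slow or picky server can hang the run or bounce the request. A minimal hardened variant, as a sketch only (the header value, timeout, and retry count are assumptions, not part of this commit):

import requests


def request_dandan_safe(url, retries=3):
    # hypothetical variant of request_dandan: browser-like UA, timeout, simple retries
    headers = {'User-Agent': 'Mozilla/5.0'}
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            continue
    return None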

douban_top_250_books.py (new file, +67)
import requests
from bs4 import BeautifulSoup
import xlwt


def request_douban(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


book = xlwt.Workbook(encoding='utf-8', style_compression=0)

sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)
sheet.write(0, 0, 'Title')
sheet.write(0, 1, 'Image')
sheet.write(0, 2, 'Rank')
sheet.write(0, 3, 'Score')
sheet.write(0, 4, 'Author')
sheet.write(0, 5, 'Summary')

n = 1  # next row to write; row 0 holds the header


def save_to_excel(soup):
    global n

    items = soup.find(class_='grid_view').find_all('li')

    for item in items:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string  # the rank sits in an <em class=""> tag
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # some entries have no one-line quote; default to '' so the writes below never see an unbound name
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        # print('Scraping movie: ' + item_index + ' | ' + item_name + ' | ' + item_img + ' | ' + item_score + ' | ' + item_author + ' | ' + item_intr)
        print('Scraping movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)

        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)

        n = n + 1


def main(page):
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    html = request_douban(url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)


if __name__ == '__main__':

    for i in range(0, 10):
        main(i)

    # xlwt writes the legacy .xls format, so save with a matching extension
    book.save('douban_top_250_movies.xls')
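
One caveat on the Excel step: xlwt can only produce the legacy .xls format, which is why the save call above uses a .xls extension. If a real .xlsx file is wanted, openpyxl is a common replacement; a minimal sketch of the idea, not what this commit uses (names are illustrative):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = 'Douban Movie Top250'
ws.append(['Title', 'Image', 'Rank', 'Score', 'Author', 'Summary'])


def save_row(name, img, index, score, author, intr):
    # append() tracks the next free row itself, so the global counter n goes away
    ws.append([name, img, index, score, author, intr])


wb.save('douban_top_250_movies.xlsx')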

fuck_bilibili_captcha.py (new file, +204)
import time
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
from io import BytesIO

driver = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver')
WAIT = WebDriverWait(driver, 10)
url = 'https://passport.bilibili.com/login'


def merge_image(image_file, location_list):
    """
    Reassemble the scrambled captcha image.
    :param image_file: image file object
    :param location_list: slice positions
    :return: the reassembled image
    """

    # slices belonging to the upper and lower halves
    upper_half_list = []
    down_half_list = []

    image = Image.open(image_file)

    # the y offset tells us whether a slice belongs to the upper or lower half
    for location in location_list:
        if location['y'] == -58:
            # slice width 10, y: 58-116
            im = image.crop((abs(location['x']), 58, abs(location['x']) + 10, 116))
            upper_half_list.append(im)
        if location['y'] == 0:
            # slice width 10, y: 0-58
            im = image.crop((abs(location['x']), 0, abs(location['x']) + 10, 58))
            down_half_list.append(im)

    # create a blank image of the same size
    new_image = Image.new('RGB', (260, 116))

    # paste the upper half (y runs top to bottom, 0-116)
    offset = 0
    for im in upper_half_list:
        new_image.paste(im, (offset, 0))
        offset += 10

    # paste the lower half
    offset = 0
    for im in down_half_list:
        new_image.paste(im, (offset, 58))
        offset += 10

    return new_image


def get_distance(bg_Image, fullbg_Image):

    # per-pixel RGB difference threshold
    threshold = 200

    print(bg_Image.size[0])
    print(bg_Image.size[1])

    # scan left to right, starting at x = 60 to skip the puzzle piece's initial
    # position; the first column where the two images differ noticeably is the gap
    # (returns None if no such column is found)
    for i in range(60, bg_Image.size[0]):
        for j in range(bg_Image.size[1]):
            bg_pix = bg_Image.getpixel((i, j))
            fullbg_pix = fullbg_Image.getpixel((i, j))
            r = abs(bg_pix[0] - fullbg_pix[0])
            g = abs(bg_pix[1] - fullbg_pix[1])
            b = abs(bg_pix[2] - fullbg_pix[2])

            if r + g + b > threshold:
                return i


def get_path(distance):
    # build a human-like track: accelerate for 4/5 of the distance, then brake
    result = []
    current = 0
    mid = distance * 4 / 5
    t = 0.2
    v = 0
    while current < (distance - 10):
        if current < mid:
            a = 2
        else:
            a = -3
        v0 = v
        v = v0 + a * t
        s = v0 * t + 0.5 * a * t * t
        current += s
        result.append(round(s))
    return result


def start_drag(driver, distance):

    # eaten by the monster (this attempt got detected)
    # knob = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#gc-box > div > div.gt_slider > div.gt_slider_knob.gt_show")))
    # ActionChains(driver).click_and_hold(knob).perform()
    # ActionChains(driver).move_by_offset(xoffset=distance, yoffset=0.1).perform()
    # time.sleep(0.5)
    # ActionChains(driver).release(knob).perform()

    # eaten by the monster, too
    # ActionChains(driver).drag_and_drop_by_offset(knob, distance-10, 0).perform()

    knob = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#gc-box > div > div.gt_slider > div.gt_slider_knob.gt_show")))
    result = get_path(distance)
    ActionChains(driver).click_and_hold(knob).perform()

    for x in result:
        ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()

    time.sleep(0.5)
    ActionChains(driver).release(knob).perform()


def recognize_code(driver):
    """
    Crack the slider captcha.
    :param driver: selenium driver
    :return:
    """

    bs = BeautifulSoup(driver.page_source, 'lxml')
    # find the divs holding the gapped-background and full-background slices
    bg_div = bs.find_all(class_='gt_cut_bg_slice')
    fullbg_div = bs.find_all(class_='gt_cut_fullbg_slice')

    # url of the background image with the gap
    bg_url = re.findall(r'background-image:\surl\("(.*?)"\)', bg_div[0].get('style'))
    # url of the full background image
    fullbg_url = re.findall(r'background-image:\surl\("(.*?)"\)', fullbg_div[0].get('style'))

    # positions of each gapped-background slice
    bg_location_list = []
    # positions of each full-background slice
    fullbg_location_list = []

    for bg in bg_div:
        location = {}
        location['x'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', bg.get('style'))[0][0])
        location['y'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', bg.get('style'))[0][1])
        bg_location_list.append(location)

    for fullbg in fullbg_div:
        location = {}
        location['x'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', fullbg.get('style'))[0][0])
        location['y'] = int(re.findall(r'background-position:\s(.*?)px\s(.*?)px;', fullbg.get('style'))[0][1])
        fullbg_location_list.append(location)

    print(bg_location_list)
    print(fullbg_location_list)

    # ask the server for jpg instead of webp
    bg_url = bg_url[0].replace('webp', 'jpg')
    fullbg_url = fullbg_url[0].replace('webp', 'jpg')
    # print(bg_url)
    # print(fullbg_url)

    # download both images
    bg_image = requests.get(bg_url).content
    fullbg_image = requests.get(fullbg_url).content
    print('images downloaded')

    # wrap the raw bytes as file objects
    bg_image_file = BytesIO(bg_image)
    fullbg_image_file = BytesIO(fullbg_image)

    # reassemble both images
    bg_Image = merge_image(bg_image_file, bg_location_list)
    fullbg_Image = merge_image(fullbg_image_file, fullbg_location_list)
    # bg_Image.show()
    # fullbg_Image.show()

    # measure the gap offset
    distance = get_distance(bg_Image, fullbg_Image)
    print('distance: %s' % str(distance))

    start_drag(driver, distance)


if __name__ == '__main__':

    # wait until the slider knob is ready
    driver.get(url)
    slider = WAIT.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#gc-box > div > div.gt_slider > div.gt_slider_knob.gt_show")))

    recognize_code(driver)

    # driver.close()
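
The track builder get_path is worth a standalone sanity check: the step sizes grow while accelerating, shrink while braking, and their sum lands just short of the target because the loop deliberately stops 10 px early. A self-contained sketch of the same accelerate-then-brake scheme, runnable without a browser (the 120 px input is an arbitrary example):

def get_path(distance):
    # a = 2 until 4/5 of the distance is covered, then a = -3, sampled every t = 0.2
    result, current, v, t = [], 0, 0, 0.2
    mid = distance * 4 / 5
    while current < (distance - 10):
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        s = v0 * t + 0.5 * a * t * t
        current += s
        result.append(round(s))
    return result


track = get_path(120)
print(track)       # step sizes ramp up, then taper off
print(sum(track))  # roughly 110: stops 10 px short of 120 by design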
