-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtop250.py
67 lines (62 loc) · 2.52 KB
/
top250.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding:utf-8 -*-
import requests
from lxml import html
import MySQLdb
import time
def _first_text(node, path):
    """Return the first result of XPath ``path`` evaluated on ``node``, or None if nothing matches."""
    # smart_strings=False makes lxml return ordinary strings instead of its
    # parent-tracking str subclasses, so the DB driver only sees plain values.
    matches = node.xpath(path, smart_strings=False)
    return matches[0] if matches else None


def getTop250(cursor, db, start=0):
    """Fetch one page of the Douban Top 250 list and insert each movie into ``top250``.

    Args:
        cursor: DB-API cursor used to execute the INSERT statements.
        db: DB-API connection owning ``cursor``; committed after every
            successful insert and rolled back after a failed one.
        start: Value sent as the ``start`` query parameter (the caller in
            this file passes multiples of 25).

    Returns:
        None. Rows are written to the database; progress and errors are
        printed.

    Raises:
        requests.exceptions.RequestException: if the HTTP request itself
            fails or exceeds the timeout.

    Notes:
        * Values are bound as query parameters, so quotes in scraped text can
          no longer break or inject into the SQL statement.
        * A field that is missing from the page is stored as SQL NULL (the
          previous string-formatted query stored the literal text "None").
    """
    payload = {'start': start, 'filter': ''}
    start_url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Host': 'movie.douban.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Connection': 'keep-alive'
    }
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(start_url, headers=headers, params=payload, timeout=30)
    if response.status_code != 200:
        # An error page contains no movie entries; report it instead of
        # silently inserting nothing.
        print('start=%s skipped: HTTP %s' % (start, response.status_code))
        return

    res = html.fromstring(response.text)
    info_path = './div[@class="item"]/div[@class="info"]'
    star_path = info_path + '/div[@class="bd"]/div[@class="star"]'
    # Parameterized statement: the driver escapes every bound value.
    sql = 'INSERT INTO top250(title, content, star, count, quote) VALUES (%s, %s, %s, %s, %s)'

    for item in res.xpath('//ol[@class="grid_view"]/li'):
        # string() flattens the whole link text; split/join drops all whitespace.
        title = "".join(item.xpath('string(' + info_path + '/div[@class="hd"]/a)').split())
        content = "".join(item.xpath(info_path + '/div[@class="bd"]/p/text()'))
        star = _first_text(item, star_path + '/span[@class="rating_num"]/text()')
        # The original read rating_num a second time here, so the vote count
        # was never captured.
        # NOTE(review): assumes the vote count is the last <span> inside the
        # star block - confirm against the live page markup.
        count = _first_text(item, star_path + '/span[last()]/text()')
        quote = _first_text(
            item,
            info_path + '/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()')

        try:
            # Execute the INSERT and commit it immediately so that one bad
            # row does not discard the rows already stored.
            cursor.execute(sql, (title, content, star, count, quote))
            db.commit()
            print(title + ' commit')
        except Exception as e:
            # Roll back in case there is any error, then continue with the next movie.
            db.rollback()
            print(e)
if __name__ == '__main__':
    # Open the database connection.
    db = MySQLdb.connect(host="localhost", user="root", passwd="", db="doubanmovie", charset='utf8')
    try:
        # Obtain a cursor for executing statements.
        cursor = db.cursor()
        try:
            for page in range(10):
                if page:
                    # Pause 10 s between page requests; no pause is needed
                    # before the first page or after the last one.
                    time.sleep(10)
                # Each page holds 25 entries, so page N starts at offset N * 25.
                getTop250(cursor, db, page * 25)
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if scraping raised.
        db.close()