
Commit 4a927af
committed

Create sechouse.py

1 parent 50f7878 commit 4a927af

File tree

1 file changed: +152 -0 lines changed

jiguang/fang/sechouse.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
import math
import random
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# Slug -> Chinese-name lookup tables, filled in by get_districts() / get_areas()
chinese_city_district_dict = dict()
chinese_area_dict = dict()

def create_headers():
    headers = dict()
    headers["User-Agent"] = random.choice(USER_AGENTS)
    headers["Referer"] = "http://www.ke.com"
    return headers
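
# Each call picks a fresh User-Agent from the pool and sends a ke.com Referer,
# so successive requests look like ordinary, varied browser traffic.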

class SecHouse(object):
    def __init__(self, district, area, name, price, desc, pic):
        self.district = district
        self.area = area
        self.price = price
        self.name = name
        self.desc = desc
        self.pic = pic

    def text(self):
        return self.district + "," + \
               self.area + "," + \
               self.name + "," + \
               self.price + "," + \
               self.desc + "," + \
               self.pic
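
# text() joins fields with bare commas, so a comma inside any field (common in
# the desc string) would shift columns in sechouse.txt; csv.writer would quote
# such fields if the output needs to stay machine-parseable.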

def get_districts():
    # Fetch the district list from the Beijing xiaoqu index page
    url = 'https://bj.ke.com/xiaoqu/'
    headers = create_headers()
    response = requests.get(url, timeout=10, headers=headers)
    html = response.content
    root = etree.HTML(html)
    elements = root.xpath('//div[3]/div[1]/dl[2]/dd/div/div/a')
    en_names = list()
    ch_names = list()
    for element in elements:
        link = element.attrib['href']
        en_names.append(link.split('/')[-2])
        ch_names.append(element.text)

    # Record each district's English slug alongside its Chinese name
    for index, name in enumerate(en_names):
        chinese_city_district_dict[name] = ch_names[index]
    return en_names
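
# Illustrative only: a district anchor such as <a href="/xiaoqu/dongcheng/">东城</a>
# yields the slug "dongcheng", so chinese_city_district_dict["dongcheng"] would
# map to "东城".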

def get_areas(district):
    # Collect the area (sub-district) slugs listed under one district
    page = "http://bj.ke.com/xiaoqu/{0}".format(district)
    areas = list()
    try:
        headers = create_headers()
        response = requests.get(page, timeout=10, headers=headers)
        html = response.content
        root = etree.HTML(html)
        links = root.xpath('//div[3]/div[1]/dl[2]/dd/div/div[2]/a')

        # Process the list of <a> tags
        for link in links:
            relative_link = link.attrib['href']
            # Strip the trailing "/"
            relative_link = relative_link[:-1]
            # Keep only the last path segment
            area = relative_link.split("/")[-1]
            # Skip the district name itself to avoid duplicates
            if area != district:
                chinese_area = link.text
                chinese_area_dict[area] = chinese_area
                # print(chinese_area)
                areas.append(area)
    except Exception as e:
        print(e)
    return areas
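
# On a network or parse error, get_areas() returns an empty list, so the crawl
# below simply skips that district rather than aborting.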

with open("sechouse.txt", "w", encoding='utf-8') as f:
    # Start fetching the area data we need
    total_page = 1
    sec_house_list = list()
    districts = get_districts()
    for district in districts:
        arealist = get_areas(district)
        for area in arealist:
            # District name in Chinese
            chinese_district = chinese_city_district_dict.get(district, "")
            # Area name in Chinese
            chinese_area = chinese_area_dict.get(area, "")
            page = 'http://bj.ke.com/ershoufang/{0}/'.format(area)
            print(page)
            headers = create_headers()
            response = requests.get(page, timeout=10, headers=headers)
            html = response.content
            soup = BeautifulSoup(html, "lxml")

            # Get the total number of pages
            try:
                page_box = soup.find_all('div', class_='page-box')[0]
                matches = re.search(r'.*data-total-count="(\d+)".*', str(page_box))
                total_page = int(math.ceil(int(matches.group(1)) / 10))
            except Exception as e:
                print(e)
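
            # Illustrative arithmetic: data-total-count="1234" would give
            # math.ceil(1234 / 10) == 124 pages. If the parse fails, total_page
            # keeps its previous value rather than resetting to 1.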
            print(total_page)
            # Iterate from the first page through the last
            headers = create_headers()
            for i in range(1, total_page + 1):
                page = 'http://bj.ke.com/ershoufang/{0}/pg{1}'.format(area, i)
                print(page)
                response = requests.get(page, timeout=10, headers=headers)
                html = response.content
                soup = BeautifulSoup(html, "lxml")

                # Get the panels carrying each listing's info
                house_elements = soup.find_all('li', class_="clear")
                for house_elem in house_elements:
                    price = house_elem.find('div', class_="totalPrice")
                    name = house_elem.find('div', class_='title')
                    desc = house_elem.find('div', class_="houseInfo")
                    pic = house_elem.find('a', class_="img").find('img', class_="lj-lazy")
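                    # 'lj-lazy' marks lazily loaded <img> tags: the real image
                    # URL lives in the data-original attribute read below, not src.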

                    # Continue cleaning the data
                    price = price.text.strip()
                    name = name.text.replace("\n", "")
                    desc = desc.text.replace("\n", "").strip()
                    pic = pic.get('data-original').strip()

                    # Save as an object
                    sec_house = SecHouse(chinese_district, chinese_area, name, price, desc, pic)
                    print(sec_house.text())
                    sec_house_list.append(sec_house)

    for sec_house in sec_house_list:
        f.write(sec_house.text() + "\n")
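
# Note: the crawl above runs at import time because it sits at module level;
# wrapping it in `if __name__ == "__main__":` would keep imports side-effect free.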
