scrape_bayut.py
import math
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

EMIRATES_VALUES = ['abu-dhabi','dubai','sharjah','ajman','umm-al-quwain','ras-al-khaimah','fujairah']
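# Bayut renders 24 listing cards per results page; used below to derive the page count.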
NUM_PROPERTIES_PER_PAGE = 24


def get_url(furnished='all', emirate='dubai', page=1):
    '''
    args:
        furnished: one of {'all', 'furnished', 'unfurnished'}
        emirate: one of {'abu-dhabi', 'dubai', 'sharjah', 'ajman', 'umm-al-quwain', 'ras-al-khaimah', 'fujairah'}
        page: int, the results-page number
    Returns the appropriate www.bayut.com listing URL for the given arguments.
    '''
    if page == 1:
        url = f'https://www.bayut.com/to-rent/property/{emirate}/'
    else:
        url = f'https://www.bayut.com/to-rent/property/{emirate}/page-{page}/'
    # Bayut encodes the furnishing filter as a query parameter.
    if furnished == 'furnished':
        url += '?furnishing_status=furnished'
    elif furnished == 'unfurnished':
        url += '?furnishing_status=unfurnished'
    return url
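
# Illustrative output of get_url, following the URL scheme constructed above:
#   get_url(furnished='furnished', emirate='dubai', page=2)
#   -> 'https://www.bayut.com/to-rent/property/dubai/page-2/?furnishing_status=furnished'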


def scrape_bayut(emirate='dubai', furnished='all', fast_scrape=False):
    '''
    args:
        emirate: one of {'abu-dhabi', 'dubai', 'sharjah', 'ajman', 'umm-al-quwain', 'ras-al-khaimah', 'fujairah'}
        furnished: one of {'all', 'furnished', 'unfurnished'}
        fast_scrape: bool; skips scraping descriptions and amenities, which is considerably faster
    Scrapes rental listings from www.bayut.com and saves the scraped data to a CSV file.
    '''
    print(f"Starting scrape for {emirate.capitalize()}..")
    emirate = emirate.lower().replace(' ', '-')
    assert emirate in EMIRATES_VALUES, f'emirate must be one of {EMIRATES_VALUES}'
    assert furnished in ['all', 'furnished', 'unfurnished'], "furnished must be one of ['all', 'furnished', 'unfurnished']"
    bedrooms, bathrooms, area, prices, locations = [], [], [], [], []
    property_types, property_keywords, furnished_bool, descriptions, amenities = [], [], [], [], []
    url = get_url(furnished=furnished, emirate=emirate, page=1)
    html_text = requests.get(url).content
    soup = BeautifulSoup(html_text, 'lxml')
    # The total listing count is read from the results header. The CSS class
    # names used throughout are Bayut-specific and may break when the site changes.
    num_properties = int(soup.find('span', class_='ca3976f7').text.split(' ')[-2].replace(',', ''))
    pages = math.ceil(num_properties / NUM_PROPERTIES_PER_PAGE)
    print(f"Found {num_properties} properties with furnished={furnished} ({pages} pages)")
    for page in tqdm(range(1, pages + 1)):
        try:
            url = get_url(furnished=furnished, emirate=emirate, page=page)
            html_text = requests.get(url).content
            soup = BeautifulSoup(html_text, 'lxml')
            properties = soup.find_all('div', class_='d6e81fd0')
            for prop in properties:
                # Each field is wrapped in try/except because any card may be
                # missing a field; -1 marks a missing value.
                try:
                    prices.append(prop.find('span', class_='f343d9ce').text)
                except AttributeError:
                    prices.append(-1)
                try:
                    locations.append(prop.find('div', class_='_7afabd84').text)
                except AttributeError:
                    locations.append(-1)
                try:
                    property_types.append(prop.find('div', class_='_9a4e3964').text)
                except AttributeError:
                    property_types.append(-1)
                try:
                    property_keywords.append(prop.find('h2', class_='_7f17f34f').text)
                except AttributeError:
                    property_keywords.append(-1)
                # Bedrooms, bathrooms, and area share one container, in that order.
                temp = []
                for i in prop.find('div', class_='_22b2f6ed').children:
                    try:
                        temp.append(i.text)
                    except AttributeError:
                        temp.append(-1)
                try:
                    bedrooms.append(temp[0])
                except IndexError:
                    bedrooms.append(-1)
                try:
                    bathrooms.append(temp[1])
                except IndexError:
                    bathrooms.append(-1)
                try:
                    area.append(temp[2])
                except IndexError:
                    area.append(-1)
                if furnished != 'all':
                    # Record the filter as a 0/1 column; compare explicitly,
                    # since the string 'unfurnished' is truthy on its own.
                    furnished_bool.append(1 if furnished == 'furnished' else 0)
                if fast_scrape:
                    continue
                # Follow this card's link to the listing page for the description
                # and amenities (one extra request per property). Search within
                # the card, not the whole page, so each property gets its own URL.
                card = prop.find('div', class_='_4041eb80') or prop
                ppty_url = 'https://www.bayut.com' + card.find('a')['href']
                ppty_html = requests.get(ppty_url).content
                soup_ppty = BeautifulSoup(ppty_html, 'lxml')
                try:
                    descriptions.append(soup_ppty.find('span', class_='_2a806e1e').text)
                except AttributeError:
                    descriptions.append(-1)
                try:
                    amenities.append(soup_ppty.find('div', class_='e475b606').text)
                except AttributeError:
                    amenities.append(-1)
        except Exception as e:
            print(e)
            print(f"Exiting early.. scraped {page-1}/{pages} pages")
            break
    col_dict = {
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "area": area,
        "prices": prices,
        "locations": locations,
        "property_types": property_types,
        "property_keywords": property_keywords,
        "furnished": furnished_bool,
        "description": descriptions,
        "amenities": amenities,
    }
    if furnished == 'all':
        del col_dict['furnished']
    if fast_scrape:
        # These columns were never populated, so drop them before building the
        # DataFrame (pandas requires all columns to have equal length).
        del col_dict['description']
        del col_dict['amenities']
    df = pd.DataFrame(col_dict)
    # Timestamped filename; strftime avoids characters such as ':' that are
    # invalid in filenames on some platforms.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    df.to_csv(f'properties_{emirate}_furnished={furnished}_{timestamp}.csv', index=False)


if __name__ == '__main__':
    scrape_bayut(emirate='dubai', furnished='all', fast_scrape=False)
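
# A minimal usage sketch (illustrative, not part of the original script):
# run the fast mode over every emirate, then load one of the resulting CSVs.
#
#     for em in EMIRATES_VALUES:
#         scrape_bayut(emirate=em, furnished='all', fast_scrape=True)
#
#     import pandas as pd
#     df = pd.read_csv('properties_dubai_furnished=all_<timestamp>.csv')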