-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCrime-Web-Scraping.py
More file actions
143 lines (93 loc) · 3.47 KB
/
Crime-Web-Scraping.py
File metadata and controls
143 lines (93 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Web scraping in Python
from bs4 import BeautifulSoup
import requests
# Download the Wikipedia article whose tables are scraped below.
url = 'https://en.wikipedia.org/wiki/California_locations_by_crime_rate'
# A timeout keeps the script from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Stop right here on an HTTP error (4xx/5xx) instead of parsing an error page
# and failing later with an unrelated IndexError during the table lookups.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html')
print(soup)

# Collect the <table> elements once and reuse the list for all three lookups.
tables = soup.find_all('table')
table0 = tables[0]
print(table0)
table1 = tables[1]
print(table1)
table2 = tables[2]
print(table2)
# The <th> cells of table1 supply the raw column titles.
table1col = table1.find_all('th')
print(table1col)
table1columns = []
for header_cell in table1col:
    # Keep only the visible text, without surrounding whitespace.
    table1columns.append(header_cell.text.strip())
print(table1columns)
# Remove square brackets and the text enclosed in them
import re
def remove_brackets(text):
    """Strip square-bracketed spans from a string.

    Every ``[...]`` group is deleted (matched non-greedily, so each
    bracket pair is removed on its own), then leading and trailing
    whitespace is trimmed from what remains.
    """
    without_brackets = re.sub(r'\[.*?\]', '', text)
    return without_brackets.strip()
# Run every table1 title through remove_brackets to get the final labels.
table1columns = list(map(remove_brackets, table1columns))
table1columns  # bare expression: evaluated and discarded when run as a script
# The <th> cells of table2 supply the raw column titles.
table2col = table2.find_all('th')
print(table2col)
table2columns = [title.text.strip() for title in table2col]
print(table2columns)

# Cleaned column titles. remove_brackets is defined earlier in this file and
# is reused here rather than being defined a second time.
table2columns = [remove_brackets(title) for title in table2columns]
print(table2columns)
import pandas as pd
# Start both frames empty, with only the cleaned titles as column labels.
df1, df2 = (
    pd.DataFrame(columns=titles) for titles in (table1columns, table2columns)
)
df1  # bare expressions: evaluated and discarded when run as a script
df2
# Every <tr> of table1; the first one is skipped below (presumably the
# header row, whose titles were already read from the <th> cells).
col1 = table1.find_all('tr')
print(col1)

# A row can only be stored when it has exactly one value per df1 column.
width1 = len(df1.columns)
row1data = []   # flat list of every data-cell text, kept for the print below
rows1 = []      # complete rows, in page order
for row in col1[1:]:
    # Read the cells of this row only. Slicing one table-wide cell list into
    # fixed-size chunks would shift every later row as soon as a single row
    # had a missing or extra cell.
    cells = [data.text.strip() for data in row.find_all('td')]
    row1data.extend(cells)
    if len(cells) == width1:
        rows1.append(cells)
print(row1data)

for cells in rows1:
    df1.loc[len(df1)] = cells

# Remove commas from the cell values (column labels are not affected).
df1 = df1.replace(',', '', regex=True)
print(df1)
df2  # bare expression: evaluated and discarded when run as a script
# Every <tr> of table2; the first one is skipped below (presumably the
# header row, whose titles were already read from the <th> cells).
col2 = table2.find_all('tr')

# A row can only be stored when it has exactly one value per df2 column.
width2 = len(df2.columns)
row2data = []   # flat list of every data-cell text, kept for the print below
rows2 = []      # complete rows, in page order
for row in col2[1:]:
    # Read the cells of this row only. Slicing one table-wide cell list into
    # fixed-size chunks would shift every later row as soon as a single row
    # had a missing or extra cell.
    cells = [data.text.strip() for data in row.find_all('td')]
    row2data.extend(cells)
    if len(cells) == width2:
        rows2.append(cells)
print(row2data)
print(df2)  # still empty at this point: rows are appended just below

for cells in rows2:
    df2.loc[len(df2)] = cells

# Remove commas from the cell values (column labels are not affected).
df2 = df2.replace(',', '', regex=True)
print(df2)
df2  # bare expressions: evaluated and discarded when run as a script
df1
print(df1.dtypes)
df1['County'] = df1['County'].astype('category')

# The remaining columns hold numbers that were scraped as text. Anything that
# cannot be parsed becomes NaN (errors='coerce'). Looping over the exact
# labels guarantees each result is written back to the column it was read
# from, so no near-duplicate column can be created by a mistyped label.
for column in (
    'Population',
    'Populationdensity',
    'Violent crimes',
    'Violent crime rateper 1,000 persons',
    'Property crimes',
    'Property crime rateper 1,000 persons',
):
    df1[column] = pd.to_numeric(df1[column], errors='coerce')
print(df1.dtypes)

# One-hot encode the county. dtype=int makes only the new indicator columns
# 0/1 integers; casting the whole frame to int instead would truncate the
# fractional rate/density values and raise on any NaN produced above.
df1_encoded = pd.get_dummies(df1, columns=['County'], drop_first=False, dtype=int)
print(df1_encoded)
print(df2.dtypes)
df2['City/Agency'] = df2['City/Agency'].astype('category')
df2['County'] = df2['County'].astype('category')

# The remaining columns hold numbers that were scraped as text. Anything that
# cannot be parsed becomes NaN (errors='coerce').
for column in (
    'Population',
    'Populationdensity',
    'Violent crimes',
    'Violent crime rateper 1,000 persons',
    'Property crimes',
    'Property crime rateper 1,000 persons',
):
    df2[column] = pd.to_numeric(df2[column], errors='coerce')
print(df2.dtypes)

# One-hot encode both label columns. dtype=int makes only the new indicator
# columns 0/1 integers; casting the whole frame to int instead would truncate
# the fractional rate/density values and raise on any NaN produced above.
df2_encoded = pd.get_dummies(
    df2, columns=['County', 'City/Agency'], drop_first=False, dtype=int
)
print(df2_encoded)