-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathbuilddb.sql
161 lines (141 loc) · 3.78 KB
/
builddb.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
create table input.bikeparking (
ADDRESS text,
LOCATION_NAME text,
STREET_NAME text,
RACKS int,
SPACES int,
PLACEMENT text,
MO_INSTALLED text,
YR_INSTALLED text,
COORDINATES text
);
create table input.crimedata(
IncidntNum INT,
Category text,
Descrip text,
DayOfWeek text,
IncidentDate DATE,
IncidentTime Time,
PdDistrict text,
Resolution text,
Address text,
xcoord numeric,
ycoord numeric,
Location point,
Pdid BIGINT
);
create table risk (
stdev numrange,
risk text );
insert into risk values
( '[ -1000, -2.0 )', 'very safe' ),
( '[ -2.0, -0.7 )', 'safe' ),
( '[ -0.7, -0.3 )', 'kinda safe' ),
( '[ -0.3, 0.5 )', 'average' ),
( '[ 0.5, 1.0 )', 'kinda risky' ),
( '[ 1.0, 2.0 )', 'risky' ),
( '[ 2.0, 4.0 )', 'very risky' ),
( '[ 4.0, 1000 )', 'already stolen' );
create or replace function chisq_p(
popvals float[] )
returns float
language plpythonu
as $f$
from scipy import stats
popmean, p = stats.chisquare(popvals)
return p
$f$;
create aggregate chisq_agg ( float ) (
initcond = '{}',
sfunc = array_append,
stype = FLOAT[],
finalfunc = chisq_p
);
create OR REPLACE function bikecoord (text)
returns text
language sql
as
$f$
with destring as (
select string_to_array(
(string_to_array($1,'
'))[2],',') as coords )
select 'POINT(' || regexp_replace(coords[2],'[ ()]','','g') || ' ' || regexp_replace(coords[1],'[ ()]','','g') || ')'
from destring;
$f$;
create table bikeparking as
select address, location_name,
street_name as street,
racks, spaces,
placement,
(case when yr_installed ~* '\d+' then yr_installed
else null end || '-' ||
case when mo_installed ~* '\d+' then mo_installed
else '01' end || '-01')::date as installed_in,
st_geogfromtext(bikecoord(coordinates)) as location
from input.bikeparking;
create table crimedata as
select incidntnum as incidentid,
category,
descript as description,
dayofweek as dayofweek,
incidentdate + incidenttime as incident_ts,
pddistrict as district,
resolution,
address,
st_geogfromtext('POINT(' || xcoord || ' ' || ycoord || ')' ) as location
from input.crimedata;
create index on bikeparking using gist(location);
create index on crimedata using gist(location)
where description ilike '%bicycle%';
select location_name, bikeparking.address, count(*)
from bikeparking
join crimedata
on st_dwithin(bikeparking.location, crimedata.location, 50)
and crimedata.description ilike '%bicycle%'
group by location_name, bikeparking.address
order by 3 desc;
create materialized view rackthefts as
select location_name, bikeparking.address, count(*)/racks::float as thefts
from bikeparking
join crimedata
on st_dwithin(bikeparking.location, crimedata.location, 50)
and crimedata.description ilike '%bicycle%'
group by location_name, bikeparking.address, racks;
select chisq_agg(thefts)::numeric
from rackthefts;
create or replace function std_compare(
val float, popvals float[])
returns float
language plpythonu
as $f$
import numpy
std = numpy.std(popvals)
avg = numpy.mean(popvals)
stdvar = ( val - avg ) / std
return stdvar
$f$;
with racktheft_window as (
select rackthefts.*,
array_agg(thefts) over () as all_thefts
from rackthefts
)
select location_name,
address,
std_compare(thefts, all_thefts) as risk_factor
from racktheft_window
order by risk_factor desc;
with racktheft_window as (
select rackthefts.*,
array_agg(thefts) over () as all_thefts
from rackthefts
), risk_factors as (
select location_name,
address,
std_compare(thefts, all_thefts) as risk_factor
from racktheft_window
)
select risk_factors.*, risk
from risk_factors
join risk ON risk_factor::numeric <@ stdev
order by risk_factor desc;