#%%
# Regression has a different output format than classification --
# it is in real values, good old floating point numbers.
# This particular dataset also has real valued data points, so
# we're going to update our data loader to be configurable,
# column by column, in how it encodes values. We'll start by
# bringing back our one hot column encoder.
#%%
import torch
import pandas

class OneHotEncoder():
    def __init__(self, series):
        '''Given a single pandas series, create an encoder
        that can turn values from that series into a one hot
        pytorch tensor.

        Arguments:
            series {pandas.Series} -- encode this
        '''
        unique_values = series.unique()
        self.ordinals = {
            val: i for i, val in enumerate(unique_values)}
        self.encoder = torch.eye(
            len(unique_values), len(unique_values))

    def __getitem__(self, value):
        '''Turn a value into a tensor

        Arguments:
            value {object} -- value to encode, most likely a string

        Returns:
            [torch.Tensor] -- a one dimensional tensor
        '''
        return self.encoder[self.ordinals[value]]
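#%%
# a quick hedged sanity check, not part of the original flow --
# encode a tiny hand-made series and confirm each unique value
# maps to a distinct one hot row; `demo_series` is invented here
#%%
demo_series = pandas.Series(['low', 'medium', 'high', 'medium'])
demo_encoder = OneHotEncoder(demo_series)
demo_encoder['medium']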
#%%
# Now we could make an encoder for numerical values, but the
# values already are numbers, so this is in effect 'no encoder' --
# we'll implement it that way.
# Time to load up the dataset and learn about our columns.
#%%
look = pandas.read_csv('./kc_house_data.csv')
look.iloc[0]
#%%
# Looking at that data, let's make a configuration of the
# categorical columns. Some of these are judgement calls -- for
# example, it's easy to think of a 1-5 scale as categorical or as
# real valued; that will be something to experiment with in the
# assignment, and there's a sketched alternative right below.
#%%
categorical = [
    'waterfront',
    'view',
    'condition',
    'grade',
]
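#%%
# a hedged example of that judgement call -- treating the 1-5
# style columns as real valued instead; `categorical_as_real` is
# a name invented here, swap it in for `categorical` when
# experimenting in the assignment
#%%
categorical_as_real = [
    'waterfront',
    'view',
]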
#%%
# One interesting column in there -- id -- that one doesn't look
# like real data, just a key from a database. Values that are
# 'unique' like this you need to throw out, otherwise you can end
# up making a machine learning hash-table!
# This isn't a hard and fast rule, but a good one to think about
# in practice: any value unique to a sample isn't likely to
# generalize well, since there isn't any other data for the
# network to compare it against.
# Whether it is a unique keyword or a unique number value, be on
# the lookout for these -- a quick sketch for spotting such
# columns follows below.
#%%
discard = [
    'id'
]
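#%%
# a hedged sketch for spotting key-like columns -- this assumes
# the `look` dataframe loaded above; any column where nearly every
# row has a distinct value is a candidate for the discard list
#%%
for column in look.columns:
    distinct_fraction = look[column].nunique() / len(look)
    if distinct_fraction > 0.9:
        print(column, distinct_fraction)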
#%%
# And the really tricky bit -- look at that date column. Time is a
# tricky one to think about, as there are seasonal effects, and in
# some sense we recognize this in how we write out time -- years,
# months, and days -- so let's break this feature into three
# numerical features.
#%%
import dateutil.parser

class DateEncoder():
    def __getitem__(self, datestring):
        '''Encode into a tensor [year, month, day]
        given an input date string.

        Arguments:
            datestring {string} -- date string, ISO format
        '''
        parsed = dateutil.parser.parse(datestring)
        return torch.Tensor(
            [parsed.year, parsed.month, parsed.day])

dates = ['date']
DateEncoder()['20141013T000000']
#%%
from torch.utils.data import Dataset

class MixedCSV(Dataset):
    def __init__(self, datafile, output_series_name,
                 date_series_names, categorical_series_names,
                 ignore_series_names):
        '''Load the dataset and create needed encoders for
        each series.

        Arguments:
            datafile {string} -- path to data file
            output_series_name {string} -- use this series/column as output
            date_series_names {list} -- column names of dates
            categorical_series_names {list} -- column names of categories
            ignore_series_names {list} -- column names to skip
        '''
        self.dataset = pandas.read_csv(datafile)
        self.output_series_name = output_series_name
        self.encoders = {}
        for series_name in date_series_names:
            self.encoders[series_name] = DateEncoder()
        for series_name in categorical_series_names:
            self.encoders[series_name] = OneHotEncoder(
                self.dataset[series_name]
            )
        self.ignore = ignore_series_names

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        '''Return an (input, output) tensor tuple
        with all categories one hot encoded.

        Arguments:
            index {int} -- row index into the dataset
        '''
        if type(index) is torch.Tensor:
            index = index.item()
        sample = self.dataset.iloc[index]
        output = torch.Tensor([sample[self.output_series_name]])
        input_components = []
        for name, value in sample.items():
            if name in self.ignore:
                continue
            elif name in self.encoders:
                input_components.append(
                    self.encoders[name][value]
                )
            else:
                input_components.append(torch.Tensor([value]))
        input = torch.cat(input_components)
        return input, output

houses = MixedCSV('./kc_house_data.csv',
    'price',
    dates,
    categorical,
    discard
)
houses[0]
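#%%
# a quick hedged check -- peek at the assembled input width, since
# the model below will need this number as its input dimension
#%%
houses[0][0].shape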
#%%
# 3-5
# The big difference for a regression network is in the output:
# rather than a softmax or a probability, we can simply emit a
# real valued number.
# Depending on the model, this number isn't simply a 0-1, and in
# our case we're looking to emit a price in dollars, so it'll be
# 6 figures.
#%%
class Model(torch.nn.Module):
    def __init__(self, input_dimensions, size=128):
        '''
        The constructor is the place to set up each layer
        and activations.
        '''
        super().__init__()
        self.layer_one = torch.nn.Linear(input_dimensions, size)
        self.activation_one = torch.nn.ReLU()
        self.layer_two = torch.nn.Linear(size, size)
        self.activation_two = torch.nn.ReLU()
        self.shape_outputs = torch.nn.Linear(size, 1)

    def forward(self, inputs):
        buffer = self.layer_one(inputs)
        buffer = self.activation_one(buffer)
        buffer = self.layer_two(buffer)
        buffer = self.activation_two(buffer)
        buffer = self.shape_outputs(buffer)
        return buffer

model = Model(houses[0][0].shape[0])
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.MSELoss()
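#%%
# a hedged sanity check before training -- run a single untrained
# sample through the network to confirm the shapes line up; the
# output will be one arbitrary number, which is all we expect here
#%%
model(houses[0][0])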
#%%
# and now our training loop
number_for_testing = int(len(houses) * 0.05)
number_for_training = len(houses) - number_for_testing
train, test = torch.utils.data.random_split(houses,
    [number_for_training, number_for_testing])
training = torch.utils.data.DataLoader(
    train, batch_size=64, shuffle=True)
for epoch in range(16):
    for inputs, outputs in training:
        optimizer.zero_grad()
        results = model(inputs)
        loss = loss_function(results, outputs)
        loss.backward()
        optimizer.step()
    print("Loss: {0}".format(loss))
#%%
# Notice those loss numbers are large: since we are computing loss
# in terms of dollars, and the error involves a square, you always
# need to get a sense of error relative to your target numbers.
# Another way to think of this would be to create a model that
# gives an output on the range 0-1 and multiply that by the range
# of values you see in the output, say 0-1000000 for houses --
# there's a sketch of that idea below. But as you can see from
# these outputs, our model seems plenty well able to learn with
# large number output.
# Let's use our test data and see what we get.
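#%%
# a minimal sketch of that target-scaling idea -- hedged, not the
# path this script actually takes: squash prices into 0-1 by
# dividing by the observed maximum, train against the scaled
# target, then multiply predictions back out to dollars;
# `price_scale` is a name invented here
#%%
price_scale = look['price'].max()
scaled_output = houses[0][1] / price_scale
# ...a model trained against scaled_output would then recover
# dollars from a 0-1 prediction via: prediction * price_scale
scaled_output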
#%%
# here is our actual price, and what the model predicts
actual = test[0][1]
predicted = model(test[0][0])
actual, predicted
#%%
# Wow -- that's pretty good for a quick eyeball check. Let's take
# a look at the overall error for all our test data. For this
# we'll reach back to sklearn and use R^2, which gives a score of
# 0-1, one being best, and is a standard method to judge the
# quality of a regression model.
#%%
import sklearn.metrics
import torch.utils.data

testing = torch.utils.data.DataLoader(
    test, batch_size=len(test), shuffle=False)
for inputs, outputs in testing:
    predicted = model(inputs).detach().numpy()
    actual = outputs.numpy()
    print(sklearn.metrics.r2_score(actual, predicted))
#%%
# Pretty good -- that's actually better than I expected when I
# started up this model. Turns out house prices are quite
# predictable in this dataset. I've actually seen this done for
# local housing prices by some of my co-workers when they were
# moving, to maximize their return and minimize their risk -- a
# pretty useful application if you can get some local MLS data
# and are planning a move!