# w3.py
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
def print_result(y, pred_, cost):
    # left panel: actual vs predicted values; right panel: loss curve
    fig, ax = plt.subplots(ncols=2, figsize=(10, 4))
    sns.scatterplot(x=np.arange(len(y)), y=y, color='red', ax=ax[0])
    sns.scatterplot(x=np.arange(len(pred_)), y=pred_, ax=ax[0])
    sns.lineplot(x=np.arange(len(cost)), y=cost, ax=ax[1])
    ax[0].legend(['Actual Data', 'Prediction'])
    plt.show()
# f - house prices
# f = x1*w1 + x2*w2 + x3*w3 + x4*w4 + epsilon
# where x1 is the floor number, x2 the area in square meters, x3 the district
# (5 distinct values just for this example; later think of it as the distance
# from the center), and x4 an old/new indicator
# w1, w2, w3 and w4 are the weights that should be calibrated
# true weights used to generate the data
w1, w2, w3, w4 = 1.8, 2.2, 1.2, 1.9
weights = pd.Series([w1, w2, w3, w4])
# suppose we have observed some values of f represented by x1_, x2_, x3_, x4_ and epsilon_
# initialize the values for the features and the error
# # ------------ your code goes here
size = 1000
x1_ = np.random.randint(low=1, high=5, size=size)    # floor number
x2_ = np.random.randint(low=15, high=60, size=size)  # square meters
x3_ = np.random.randint(low=1, high=5, size=size)    # district / distance
x4_ = np.random.randint(low=0, high=2, size=size)    # old/new flag; high is exclusive, so low=0 yields both 0s and 1s
epsilon_ = np.random.normal(loc=0, scale=1, size=size)
# end of the code ------------
# get the sample dataset as a pandas dataframe using these values
# ------------ your code goes here
#//////////////////////////////////////////////////////////////////////////////////////
house_dict = {'floor_number': x1_, 'square_meter': x2_, 'distance': x3_, 'old/new': x4_}
house = pd.DataFrame(house_dict)
house['target'] = np.dot(house,weights) + epsilon_
y = house.pop('target')
#/////////////////////////////////////////////////////////////////////////////////////
# Normal Equation: the closed-form least-squares solution w = (X^T X)^(-1) X^T y
new_weights = np.linalg.inv(house.T.dot(house)).dot(house.T).dot(y)
new_house = house.copy(deep=True)
new_house['pred'] = np.dot(new_house, new_weights)
new_house['target'] = y
print("R^2 score:", r2_score(new_house['target'], new_house['pred']))
print("Estimated weights:", new_weights, '\nTrue weights:', list(weights))
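# Optional cross-check, a sketch assuming scikit-learn (already a dependency
# above): LinearRegression with fit_intercept=False solves the same
# least-squares problem, so its coefficients should closely match new_weights.
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False).fit(house, y)
print("sklearn coefficients:", lr.coef_)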
#/////////////////////////////////////////////////////////////////////////////////////
# end of the code ------------
# g - the function that we want to learn from our sample; it has a general structure and a loss function
# ------------------------------------- specify the general structure
# the exact form of the true function
# g = x1_*w1_ + x2_*w2_ + x3_*w3_ + x4_*w4_                          # model 1
# oversimplify the model
# g = x1_*w1_ + x3_*w3_ + x4_*w4_                                    # model 2
# g = x1_*w1_ + x4_*w4_                                              # model 3
# overcomplicate the model
# x1_sq = x1_**2
# x2_sq = x2_**2
# g = x1_*w1_ + x2_*w2_ + x3_*w3_ + x4_*w4_ + x1_sq*w5_              # model 4
# g = x1_*w1_ + x2_*w2_ + x3_*w3_ + x4_*w4_ + x1_sq*w5_ + x2_sq*w6_  # model 5
# specify the loss function - rmse
# ------------ your code goes here
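# The loss both optimizers below minimize (written out to match the code):
#   RMSE(w) = sqrt( sum_i (y_i - x_i . w)^2 / (2*m) )
# where m is the number of samples; the 1/2 factor pairs with the 2/m in the
# gradient, grad(w) = (2/m) * X^T (X w - y).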
cols = ['full_sq', 'life_sq', 'floor', 'big_church_count_5000', 'church_count_5000',
        'leisure_count_5000', 'sport_count_5000', 'market_count_500', 'price_doc']
path = 'C:\\Users\\user\\applied statistics and data scince\\ML\\data\\Sber_b_house\\train.csv'
df = pd.read_csv(path, usecols=cols, nrows=1000)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # fill NaNs with column means
# keep df.columns (not the cols list) so names stay aligned with the data,
# since read_csv preserves the CSV's column order rather than the usecols order
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
y = df.pop('price_doc')
X = df.copy()
#///////////////////////////////////////////////////////////////////////////////////////////////
eta = 5e-7     # tiny learning rate, needed because the features are unscaled
n_epochs = 500
m = len(X)     # number of training samples
stop_ = 1e+3   # early-stop threshold on the loss
cost = []
def gradient_descent(X, y, eta, n_epochs, m, stop_, cost):
    # start from a deliberately bad guess so the convergence is visible
    initial_weights = np.zeros(X.shape[1]) - 200
    print(initial_weights)
    for epoch in range(n_epochs):
        gradients = 2 / m * X.T.dot(X.dot(initial_weights) - y)
        initial_weights = initial_weights - eta * gradients
        errors = y - X.dot(initial_weights)
        loss = np.sqrt((errors ** 2).sum() / (2 * m))  # note the parentheses: divide by 2*m
        cost.append(loss)
        if loss < stop_:  # early stop once the loss is small enough
            break
    return initial_weights, cost, epoch
initial_weights_, cost, epoch = gradient_descent(X, y, eta, n_epochs, m, stop_, cost)
print("Learned weights:\n", initial_weights_)
print('Iteration was stopped at:', epoch)
pred_ = X.dot(initial_weights_)
print("R^2 score:", r2_score(y, pred_))
print_result(y, pred_, cost)
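# Side note (a sketch, not part of the original exercise): the tiny eta above
# is forced by the unscaled features. Standardizing X (and centering y, since
# the model has no intercept) lets a much larger learning rate converge:
X_std = (X - X.mean()) / X.std()
y_ctr = y - y.mean()
w_std, cost_std, epoch_std = gradient_descent(X_std, y_ctr, 0.05, n_epochs, m, stop_, [])
print("R^2 on standardized features:", r2_score(y_ctr, X_std.dot(w_std)))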
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
# Stochastic Gradient Descent
# reload the same sample so this section runs independently of the one above
df = pd.read_csv(path, usecols=cols, nrows=1000)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
y = df.pop('price_doc')
X = df.copy()
n_epochs = 100
t0, t1 = 5, 5e9
def learning_schedule(t):
    return t0 / (t + t1)
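# Note: with t1 this large the schedule is almost flat over the ~20,000
# updates below (eta stays near t0/t1 = 1e-9), so it effectively acts as a
# fixed small step size chosen for the unscaled features.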
def rmse(x, y, theta):
    # root-mean-squared error with the same 1/2 factor as the batch version
    xw = x.dot(theta)
    return np.sqrt(((xw - y) ** 2).sum() / (2 * len(y)))
def stochastic_gradient_descent(X, y, n_epochs):
    theta = np.random.randn(X.shape[1])
    loss = []
    m = 200  # updates per epoch
    for epoch in range(n_epochs):
        for i in range(m):
            # draw one random row from the whole dataset (not just the first m rows)
            random_index = np.random.randint(len(X))
            xi = X.iloc[random_index:random_index + 1]
            yi = y.iloc[random_index:random_index + 1]
            gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
            eta = learning_schedule(epoch * m + i)
            theta = theta - eta * gradients
        cost = rmse(X, y, theta)
        loss.append(cost)
        print("Epoch--", epoch, 'ETA--', eta, 'RMSE--', cost)
    return theta, loss
new_theta, loss = stochastic_gradient_descent(X, y, n_epochs)
pred__ = X.dot(new_theta)
print("R^2 score:", r2_score(y, pred__))
print_result(y, pred__, loss)
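# Optional cross-check, a sketch assuming scikit-learn's SGDRegressor (not
# part of the original exercise): it performs the same kind of per-sample
# updates with a built-in learning-rate schedule, and behaves far better on
# scaled inputs.
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)
sgd = SGDRegressor(max_iter=1000, random_state=0).fit(X_scaled, y)
print("sklearn SGDRegressor R^2:", sgd.score(X_scaled, y))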
# end of the code ------------
# define the gradient descent optimizer for the multivariate linear regression problem
# ------------ your code goes here
# end of the code ------------
# train the models; for each you should get the model's RMSE and parameter
# values (compare them with the true parameter values) - a sketch for model 2
# follows the placeholders below
# ------------ your code goes here
# model 1
# model 2
# model 3
# model 4
# model 5
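# A minimal sketch for model 2 only (one possible approach, not a full
# solution): rebuild the synthetic target from the true weights, drop the
# square-meter column, and reuse the normal equation from above. Models 1 and
# 3-5 follow the same pattern with different column subsets / squared columns.
y_house = np.dot(house, weights) + epsilon_
X_m2 = house[['floor_number', 'distance', 'old/new']]
w_m2 = np.linalg.inv(X_m2.T.dot(X_m2)).dot(X_m2.T).dot(y_house)
print("model 2 weights:", w_m2, "R^2:", r2_score(y_house, X_m2.dot(w_m2)))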
# end of the code ------------
# we will discuss the final part, model validation, during the upcoming workshops