
Commit d5551f6

Added my own implementation of linear regression without using sklearn and made small changes to decision_tree.py
1 parent 5d80d12 commit d5551f6

5 files changed: +156 −4 lines changed


Decision Tree/decision_tree.py

+1 −1
@@ -46,7 +46,7 @@ def visualize(classifier):
 
 # Creating the classifier, which will use information gain as attribute selection measure
 # and limiting the tree to a maximum depth of 4
-classifier = DecisionTreeClassifier(criterion="gini", max_depth=4)
+classifier = DecisionTreeClassifier(criterion="entropy", max_depth=4)
 
 classifier = classifier.fit(X_train, y_train)
 
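Switching criterion from "gini" to "entropy" brings the code in line with its comment: in sklearn's DecisionTreeClassifier, criterion="entropy" selects splits by information gain, while "gini" uses Gini impurity. A minimal sketch (not part of this commit) contrasting the two measures for a binary node with class probabilities p and 1 - p:

import numpy as np

def gini_impurity(p):
    # Gini impurity: 1 minus the sum of squared class probabilities
    return 1 - (p ** 2 + (1 - p) ** 2)

def entropy(p):
    # Shannon entropy in bits; zero-probability classes are excluded
    probs = np.array([p, 1 - p])
    probs = probs[probs > 0]
    return -np.sum(probs * np.log2(probs))

print(gini_impurity(0.5), entropy(0.5))  # 0.5 and 1.0: a maximally impure node
print(gini_impurity(0.9), entropy(0.9))  # both lower for a purer node

Both measures peak at a 50/50 split and fall to zero for a pure node, so in practice they tend to pick similar splits.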

Decision Tree/tree.png

-971 KB

Linear_Regression/linear_regression.py

+2 −2
@@ -21,7 +21,7 @@ def plot_scatter_diagram(data):
     :param data: DataFrame
     :return: None
     """
-    att = "G1"
+    att = "failures"
     style.use("ggplot")
     pyplot.scatter(data[att], data["G3"])
     pyplot.xlabel(att)
@@ -48,7 +48,7 @@ def show_output(predictions, x_test, y_test, linear):
         if not predictions[x] == y_test[x]:
             err += 1
 
-    print("Total Accuracy:", round(linear.score(x_test, y_test) * 100, 2), "% with ", err, "errors. ")
+    print("Total Accuracy (R²-Score):", linear.score(x_test, y_test))
     print(type(y_test), type(predictions))
 
 
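The rewritten print statement reports the raw return value of linear.score, which for sklearn regressors is the coefficient of determination R² rather than a percentage. For reference, with predictions ŷᵢ, targets yᵢ, and target mean ȳ:

$$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$$

This is the same statistic the new r_squared function below computes by hand.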

Linear_Regression/linear_regression_no_lib.py

+150

@@ -0,0 +1,150 @@
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics


# -------------------------------------------------------------------------------------------------------- #
# A script using linear regression to estimate the grades of students in G3 based on their results in G1   #
# and G2 as well as their absences during the academic year, their failures and the time studied per week. #
#                                                                                                           #
# This script uses the following dataset:                                                                   #
# https://archive.ics.uci.edu/ml/datasets/Student+Performance                                               #
# -------------------------------------------------------------------------------------------------------- #


def read_data(filename):
    """
    Function for reading the CSV file and dropping all columns that aren't important for our purpose.
    :param filename: String
    :return: DataFrame
    """
    dat = pd.read_csv(filename, sep=";")
    dat = dat[["G1", "G2", "studytime", "failures", "absences", "G3"]]
    return dat


def r_squared(pred, res):
    """
    Calculates the R² score of this model.
    The returned value is at most 1.0; the higher, the better.
    :param pred: List<Int>
    :param res: List<Int>
    :return: Float
    """
    mean_res = statistics.mean(res)
    ss_t = 0
    ss_r = 0

    for i in range(len(pred)):
        ss_t += (res[i] - mean_res) ** 2
        ss_r += (res[i] - pred[i]) ** 2

    return 1 - (ss_r / ss_t)


def rmse(pred, res):
    """
    Calculates the Root Mean Square Error.
    The lower the returned value, the better.
    :param pred: List<Int>
    :param res: List<Int>
    :return: Float
    """
    error = 0
    for i in range(len(pred)):
        error += (res[i] - pred[i]) ** 2
    return np.sqrt(error / len(pred))


def get_cost(X, y, theta):
    """
    Computes the cost (half the mean squared error) for the current values of theta.
    :param X: numpy.ndarray
    :param y: numpy.ndarray
    :param theta: numpy.ndarray
    :return: Float
    """
    cost = np.power((X @ theta.T) - y, 2)
    return np.sum(cost) / (2 * len(X))


def gradient_descent(X, y, theta, iterations, alpha):
    """
    Optimizes the values of theta using batch gradient descent,
    i.e. the update theta := theta - (alpha / m) * X^T (X theta - y).
    :param X: numpy.ndarray
    :param y: numpy.ndarray
    :param theta: numpy.ndarray
    :param iterations: Integer
    :param alpha: Float
    :return: numpy.ndarray, numpy.ndarray
    """
    cost = np.zeros(iterations)
    for i in range(iterations):
        theta = theta - (alpha / len(X)) * np.sum(X * ((X @ theta.T) - y), axis=0)
        cost[i] = get_cost(X, y, theta)
    return theta, cost


data = read_data("student-mat.csv")

# Splitting the data into two batches:
# 70% training data, 30% test data
train = data.sample(frac=0.7)
test = data.drop(train.index)

# Preparing two numpy arrays.
# X holds all features except G3; y holds only G3.
# A column of ones is prepended so that theta[0] acts as the intercept.
X = train.iloc[:, :5]
ones = np.ones([X.shape[0], 1])
X = np.concatenate((ones, X), axis=1)

y = train.iloc[:, -1:].values

# Initializing theta
theta = np.zeros([1, 6])

# Setting the hyperparameters
alpha = 0.00001
iterations = 5000

# Training the model:
# optimizing theta via gradient descent and calculating the final cost
theta, cost = gradient_descent(X, y, theta, iterations, alpha)
final_cost = get_cost(X, y, theta)

# Plotting the cost against the iteration count
fig, ax = plt.subplots()
ax.plot(np.arange(iterations), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

print("Final cost: ", final_cost)

# Initializing the test set
X_test = test.iloc[:, :5].values.tolist()

y_test = test.iloc[:, -1:].values

theta = theta.tolist()

# Flattening y_test from [[10], [4], ..., [20]] to a simple list [10, 4, ..., 20]
store = []
for entry in y_test.tolist():
    store.append(entry[0])

y_test = store.copy()

# Calculating predictions using the model theta[0] + theta[1]*x1 + ... + theta[5]*x5
predictions = []
for line in X_test:
    prediction = round(theta[0][0] + (theta[0][1] * line[0]) + (theta[0][2] * line[1]) + (theta[0][3] * line[2]) +
                       (theta[0][4] * line[3]) + (theta[0][5] * line[4]))
    predictions.append(prediction)

# Printing the scores of the model
print("RMSE-Score: ", rmse(predictions, y_test))
print("R²-Score:", r_squared(predictions, y_test))
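Since the script fits theta iteratively, a natural sanity check (not part of the commit) is the closed-form least-squares solution, which gradient descent should approach. A sketch assuming the same X and y built above:

# Hypothetical cross-check: solve the normal equation X^T X theta = X^T y
# directly instead of iterating. np.linalg.solve is preferred over an
# explicit matrix inverse for numerical stability.
theta_closed = np.linalg.solve(X.T @ X, X.T @ y)
print("Closed-form theta:", theta_closed.ravel())

With the very small learning rate (alpha = 0.00001) and only 5000 iterations, the two solutions may still differ noticeably; raising the iteration count should shrink the gap.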

README.md

+3 −1
@@ -34,7 +34,9 @@ Image taken from [wikimedia](https://commons.wikimedia.org/wiki/File:Linear_regr
 
 Besides predicting the final grade of a student, linear_regression.py can also plot the relationship between two sets of data.
 
-**Accuracy:** ~75% to ~90%
+**Accuracy:** R²-Score of ~0.75 to ~0.9
+
+In the Linear_Regression directory you can also find linear_regression_no_lib.py, my implementation of linear regression without using sklearn.
 
 <a id="knn"></a>
 ## K-Nearest Neighbour
