import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics


# -------------------------------------------------------------------------------------------------------- #
# A script using linear regression to estimate students' final grade (G3) from their results in G1 and G2,  #
# their absences during the academic year, their past failures and their weekly study time.                 #
#                                                                                                            #
# This script uses the following dataset:                                                                    #
# https://archive.ics.uci.edu/ml/datasets/Student+Performance                                                #
# -------------------------------------------------------------------------------------------------------- #


def read_data(filename):
    """
    Read the CSV file and drop all columns that aren't relevant for this model.
    :param filename: String
    :return: DataFrame
    """
    dat = pd.read_csv(filename, sep=";")
    dat = dat[["G1", "G2", "studytime", "failures", "absences", "G3"]]
    return dat


def r_squared(pred, res):
    """
    Calculate the R² score of the model.
    A value of 1.0 is a perfect fit; the score can even turn negative when the
    model performs worse than simply predicting the mean of the results.
    :param pred: List<Int>
    :param res: List<Int>
    :return: Float
    """
    ss_t = 0
    ss_r = 0
    mean_res = statistics.mean(res)  # computed once instead of on every loop iteration

    for i in range(len(pred)):
        ss_t += (res[i] - mean_res) ** 2
        ss_r += (res[i] - pred[i]) ** 2

    return 1 - (ss_r / ss_t)

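# For reference: R² = 1 - SS_res / SS_tot, with SS_res = Σ(yᵢ - ŷᵢ)² (ss_r above)
# and SS_tot = Σ(yᵢ - ȳ)² (ss_t above).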

def rmse(pred, res):
    """
    Calculate the Root Mean Square Error.
    The lower the returned value, the better.
    :param pred: List<Int>
    :param res: List<Int>
    :return: Float
    """
    squared_error = 0  # sum of squared residuals; named so it doesn't shadow the function itself
    for i in range(len(pred)):
        squared_error += (res[i] - pred[i]) ** 2
    return np.sqrt(squared_error / len(pred))

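# For reference: RMSE = sqrt(Σ(yᵢ - ŷᵢ)² / n); it is expressed in the same unit
# as the grades themselves.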

def get_cost(X, y, theta):
    """
    Get the cost for the current values of theta.
    :param X: numpy.ndarray
    :param y: numpy.ndarray
    :param theta: numpy.ndarray
    :return: Float
    """
    cost = np.power(((X @ theta.T) - y), 2)
    return np.sum(cost) / (2 * len(X))

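# For reference: the cost above is J(θ) = (1/2m) Σ(xᵢθᵀ - yᵢ)², i.e. half the mean
# squared error; the factor 1/2 only simplifies the gradient.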

def gradient_descent(X, y, theta, iterations, alpha):
    """
    Optimize the values of theta using gradient descent.
    :param X: numpy.ndarray
    :param y: numpy.ndarray
    :param theta: numpy.ndarray
    :param iterations: Integer
    :param alpha: Float
    :return: numpy.ndarray, numpy.ndarray
    """
    cost = np.zeros(iterations)
    for i in range(iterations):
        theta = theta - (alpha / len(X)) * np.sum(X * ((X @ theta.T) - y), axis=0)
        cost[i] = get_cost(X, y, theta)
    return theta, cost

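# For reference: each iteration applies the batch update θ := θ - (α/m) Σ(xᵢθᵀ - yᵢ)xᵢ,
# which is the gradient of J(θ) scaled by the learning rate α.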

data = read_data("student-mat.csv")

# Splitting the data into two batches:
# 70% training data, 30% test data
train = data.sample(frac=0.7)
test = data.drop(train.index)
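# Note: sample() draws a random 70% of the rows, so the split differs on every run;
# passing random_state to sample() would make it reproducible.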

# Preparing two numpy arrays:
# X holds all feature columns (everything except G3) and y holds only G3
X = train.iloc[:, :5]
ones = np.ones([X.shape[0], 1])
X = np.concatenate((ones, X), axis=1)
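# The prepended column of ones lets theta[0] act as the intercept (bias) term of the model.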

y = train.iloc[:, -1:].values

# Initializing theta
theta = np.zeros([1, 6])

# Setting the hyperparameters
alpha = 0.00001
iterations = 5000
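# The features are used unscaled here, so alpha has to stay small for gradient descent
# to converge; standardizing the features first would allow a larger learning rate.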

# Training the model.
# This means minimizing the cost via gradient descent and then computing the final cost.
theta, cost = gradient_descent(X, y, theta, iterations, alpha)
final_cost = get_cost(X, y, theta)

# Plotting the cost in relation to the iteration
fig, ax = plt.subplots()
ax.plot(np.arange(iterations), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

print("Final cost: ", final_cost)

# Initializing the test set
X_test = test.iloc[:, :5].values.tolist()

y_test = test.iloc[:, -1:].values

theta = theta.tolist()

# Flattening y_test from [[10], [4], ..., [20]] into a simple list [10, 4, ..., 20]
y_test = [entry[0] for entry in y_test.tolist()]

# Calculating predictions using the model theta1 + (theta2 * x1) + ... + (theta6 * x5)
predictions = []
for line in X_test:
    prediction = round(theta[0][0] + (theta[0][1] * line[0]) + (theta[0][2] * line[1]) +
                       (theta[0][3] * line[2]) + (theta[0][4] * line[3]) + (theta[0][5] * line[4]))
    predictions.append(prediction)

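# Equivalently, a vectorized sketch of the loop above (X_arr is a hypothetical name; the
# ones column is prepended to X_test just as it was for X, and np.rint returns floats):
#   X_arr = np.concatenate((np.ones([len(X_test), 1]), np.array(X_test)), axis=1)
#   predictions = np.rint(X_arr @ np.array(theta).T).flatten().tolist()
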
# Printing the scores of the model
print("RMSE score: ", rmse(predictions, y_test))
print("R² score:", r_squared(predictions, y_test))

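# A minimal cross-check sketch, assuming scikit-learn is available (this script does not
# otherwise depend on it); it fits the same feature columns with ordinary least squares:
#
#   from sklearn.linear_model import LinearRegression
#   reg = LinearRegression().fit(train.iloc[:, :5], train.iloc[:, -1])
#   print("sklearn R²:", reg.score(test.iloc[:, :5], test.iloc[:, -1]))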