-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
64 lines (48 loc) · 2.02 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import math
from sys import float_info
import pandas as pd
import sklearn.utils
from sklearn.model_selection import train_test_split
import numpy as np
from DataSetTypes import DataSetNorm
def normalize_sr_eval(y_i: float) -> float:
    """
    Squash a raw eval into the range 0 to 1.4 with a scaled, shifted logistic curve.

    Computes 1.4 / (1 + exp(-0.01 * (y_i + 25))), so y_i = -25 maps to 0.7.

    :param y_i: Raw eval value to normalize
    :return: Normalized value; saturates to 0.0 for extremely negative inputs and to 1.4
        for extremely positive ones
    """
    j = 0.01
    numerator = 1.4
    try:
        denominator = 1 + math.exp(-j * (y_i + 25))
    except OverflowError:
        # math.exp raises once its argument exceeds ~709 (y_i below roughly -71,000).
        # The denominator is unbounded there, so the curve's limit is 0.
        return 0.0
    return numerator / denominator
def inverse_normalized_eval(y_i: float | np.ndarray) -> float | np.ndarray:
y_i = np.minimum(np.maximum(y_i, float_info.epsilon), 1)
j = 0.01
return ((1 / j) * -np.log((1.4 / y_i) - 1)) - 25
def get_data_split(dataset_path: str, dataset_norm: DataSetNorm,
                   test_size: float = 0.2, random_state: int | None = None) \
        -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Load a dataset from CSV and split it into training and testing sets.

    :param dataset_path: Path to get the dataset from
    :param dataset_norm: When DataSetNorm.NORMALIZED, the 'eval' targets are run through
        normalize_sr_eval; any other value leaves them unchanged
    :param test_size: Fraction of rows held out for testing (forwarded to train_test_split)
    :param random_state: Optional seed forwarded to train_test_split; None keeps the
        split non-deterministic
    :return: Dataset split into x_train, x_test, y_train, y_test
    :raises KeyError: If the CSV has no 'eval' column
    """
    df = pd.read_csv(dataset_path)

    # Read the target first so a missing 'eval' column still fails loudly.
    y = df['eval']

    # 'Unnamed: 0' is presumably an index column written out with the CSV (TODO confirm);
    # errors='ignore' lets files saved without it load as well.
    x = df.drop(columns=['eval', 'Unnamed: 0'], errors='ignore')

    if dataset_norm is DataSetNorm.NORMALIZED:
        y = y.apply(normalize_sr_eval)

    # Split the data into training and testing sets
    # Format: x_train, x_test, y_train, y_test
    return train_test_split(x, y, test_size=test_size, random_state=random_state)
def unimplemented():
    """
    Placeholder for code paths that have not been written yet.

    :raises NotImplementedError: Always, with the message "Unimplemented!"
    """
    # NotImplementedError derives from Exception, so existing `except Exception`
    # handlers keep working.
    raise NotImplementedError("Unimplemented!")
def relevant_cols_from_cv_results(cv_results: dict) -> pd.DataFrame:
    """
    Reduce a cross-validation results mapping to its parameter and test-score columns.

    Keeps every key starting with 'param_' (in the mapping's own order), followed by
    'mean_test_score' and 'std_test_score'.

    :param cv_results: Mapping of column name to per-candidate values, e.g. the
        ``cv_results_`` attribute of a scikit-learn search estimator (any dict
        subclass such as Bunch is accepted)
    :return: DataFrame holding only the selected columns
    :raises KeyError: If 'mean_test_score' or 'std_test_score' is missing
    """
    param_cols = [key for key in cv_results if key.startswith('param_')]
    wanted = param_cols + ['mean_test_score', 'std_test_score']
    # Build the frame in one constructor call rather than inserting column by column.
    return pd.DataFrame({name: cv_results[name] for name in wanted})
def generate_ptp_terms(heights: list[int], num_columns: int = 10) -> pd.DataFrame:
    """
    Build a one-row DataFrame of column heights plus peak-to-peak terms.

    The row holds 'col0' .. 'col{num_columns - 1}', followed by one
    'ptp(colA,...,colB)' column for every contiguous window of two or more columns.
    Each ptp value is max - min of the heights in that window. Windows are ordered
    by width first, then by starting column.

    :param heights: Height of each column; only the first num_columns entries are used
    :param num_columns: Number of leading heights to include (defaults to 10)
    :return: Single-row DataFrame (index 0) with the height and ptp columns
    :raises IndexError: If heights has fewer than num_columns entries
    """
    if len(heights) < num_columns:
        raise IndexError(
            f"expected at least {num_columns} heights, got {len(heights)}"
        )

    labels = [f'col{i}' for i in range(num_columns)]
    row = {label: heights[i] for i, label in enumerate(labels)}

    for width in range(2, num_columns + 1):
        for start in range(num_columns - width + 1):
            stop = start + width
            window = heights[start:stop]
            row[f'ptp({",".join(labels[start:stop])})'] = max(window) - min(window)

    # Construct the frame once instead of inserting each ptp column separately.
    return pd.DataFrame(row, index=[0])