
Commit a8f980b

Add files via upload
1 parent ea4ba0f commit a8f980b

9 files changed, +3150 -0 lines changed

DataTransformation.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"

from sklearn.decomposition import PCA, FastICA

import util


class PrincipalComponentAnalysis:

    def __init__(self):
        self.pca = []

    # Perform the PCA on the selected columns and return the explained variance.
    def determine_pc_explained_variance(self, data_table, cols):
        # Normalize the data first.
        dt_norm = util.normalize_dataset(data_table, cols)

        # Perform the PCA.
        self.pca = PCA(n_components=len(cols))
        self.pca.fit(dt_norm[cols])

        # And return the explained variances.
        return self.pca.explained_variance_ratio_

    # Apply a PCA given the number of components we have selected.
    # We add new pca columns.
    def apply_pca(self, data_table, cols, number_comp):
        # Normalize the data first.
        dt_norm = util.normalize_dataset(data_table, cols)

        # Perform the PCA.
        self.pca = PCA(n_components=number_comp)
        self.pca.fit(dt_norm[cols])

        # Transform our old values.
        new_values = self.pca.transform(dt_norm[cols])

        # And add the new ones:
        for comp in range(0, number_comp):
            data_table['pca_' + str(comp + 1)] = new_values[:, comp]

        return data_table


class IndependentComponentAnalysis:
    # Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html

    def __init__(self):
        self.ica = []

    # Apply a FastICA given the number of components we have selected.
    # We add new FastICA columns.
    def apply_fast_ica(self, data_table, cols, number_comp):
        # Normalize the data first.
        dt_norm = util.normalize_dataset(data_table, cols)

        # Perform the FastICA for all components.
        self.ica = FastICA(n_components=number_comp)
        self.ica.fit(dt_norm[cols])

        # Transform our old values.
        new_values = self.ica.transform(dt_norm[cols])

        # And add the new ones:
        for comp in range(0, number_comp):
            data_table['FastICA_' + str(comp + 1)] = new_values[:, comp]

        return data_table
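
For reference, a minimal usage sketch of the two classes above. The DataFrame, the column names 'acc_x'/'acc_y'/'acc_z' and the random data are invented for illustration, and the import assumes the util module providing normalize_dataset (which these files expect) is on the path.

import numpy as np
import pandas as pd

from DataTransformation import PrincipalComponentAnalysis, IndependentComponentAnalysis

# Hypothetical toy data: three partially correlated sensor channels.
rng = np.random.default_rng(0)
df = pd.DataFrame({'acc_x': rng.normal(size=200)})
df['acc_y'] = 0.8 * df['acc_x'] + rng.normal(scale=0.2, size=200)
df['acc_z'] = rng.normal(size=200)
cols = ['acc_x', 'acc_y', 'acc_z']

pca = PrincipalComponentAnalysis()
# Explained variance ratio per principal component, e.g. to pick a cut-off.
print(pca.determine_pc_explained_variance(df, cols))

# Keep two components; this appends 'pca_1' and 'pca_2' columns to df.
df = pca.apply_pca(df, cols, 2)

ica = IndependentComponentAnalysis()
# Appends 'FastICA_1' and 'FastICA_2' columns to df.
df = ica.apply_fast_ica(df, cols, 2)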

FrequencyAbstraction.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"

import numpy as np
import pandas as pd


# This class performs a Fourier transformation on the data to find frequencies that
# occur often and to filter noise.
class FourierTransformation:

    def __init__(self):
        self.temp_list = []
        self.freqs = None

    # Find the amplitudes of the different frequencies using a fast Fourier transformation.
    # Here, the sampling rate expresses the number of samples per second
    # (i.e. the frequency of the dataset in Hertz).
    def find_fft_transformation(self, data):
        # Create the transformation; this includes the amplitudes of both the real
        # and imaginary parts.
        transformation = np.fft.rfft(data, len(data))

        # Real amplitudes.
        real_ampl = transformation.real
        # Frequency with the highest real amplitude.
        max_freq = self.freqs[np.argmax(real_ampl[0:len(real_ampl)])]
        # Amplitude-weighted average frequency.
        freq_weighted = float(np.sum(self.freqs * real_ampl)) / np.sum(real_ampl)

        # Power spectral entropy (pse).
        PSD = np.divide(np.square(real_ampl), float(len(real_ampl)))
        PSD_pdf = np.divide(PSD, np.sum(PSD))

        # Make sure there are no zeros before taking the logarithm.
        if np.count_nonzero(PSD_pdf) == PSD_pdf.size:
            pse = -np.sum(np.log(PSD_pdf) * PSD_pdf)
        else:
            pse = 0

        # Prepend the summary values in the same order as the column names built in
        # abstract_frequency: max_freq, freq_weighted, pse, then the amplitudes.
        row = np.insert(real_ampl, 0, pse)
        row = np.insert(row, 0, freq_weighted)
        row = np.insert(row, 0, max_freq)

        self.temp_list.append(row)

        return 0

    # Get frequencies over a certain window.
    def abstract_frequency(self, data_table, columns, window_size, sampling_rate):
        self.freqs = (sampling_rate *
                      np.fft.rfftfreq(int(window_size))).round(3)

        for col in columns:
            # Prepare the column names.
            collist = [col + '_max_freq', col + '_freq_weighted', col + '_pse']
            collist = collist + [col + '_freq_' + str(freq) + '_Hz_ws_' +
                                 str(window_size) for freq in self.freqs]

            # Rolling statistics to calculate frequencies, per window size.
            # The pandas rolling method can only return one aggregation value, so the
            # values are not returned but stored in the temporary class variable 'temp_list'.
            data_table[col].rolling(
                window_size + 1).apply(self.find_fft_transformation)

            # Pad the incomplete leading windows with NaNs.
            frequencies = np.pad(np.array(self.temp_list), ((window_size, 0), (0, 0)),
                                 'constant', constant_values=np.nan)

            # Add the new frequency columns to the frame.
            data_table[collist] = pd.DataFrame(
                frequencies, index=data_table.index)

            # Reset the temporary storage.
            del self.temp_list[:]

        return data_table
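
A small, hypothetical driver for the class above; the 5 Hz sine, the sampling rate and the window size are invented for illustration only.

import numpy as np
import pandas as pd

from FrequencyAbstraction import FourierTransformation

# Hypothetical signal: a 5 Hz sine sampled at 100 Hz for 10 seconds.
fs = 100
t = np.arange(0, 10, 1.0 / fs)
df = pd.DataFrame({'acc_x': np.sin(2 * np.pi * 5 * t)})

ft = FourierTransformation()
# A 40-sample window adds 'acc_x_max_freq', 'acc_x_freq_weighted', 'acc_x_pse'
# and one 'acc_x_freq_<f>_Hz_ws_40' column per frequency bin; the first 40 rows
# stay NaN because their window is incomplete.
df = ft.abstract_frequency(df, ['acc_x'], window_size=40, sampling_rate=fs)
print(df['acc_x_max_freq'].dropna().head())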

ImputationMissingValues.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"


class ImputationMissingValues:

    # Impute the mean value in case of missing data.
    def impute_mean(self, dataset, col):
        dataset[col] = dataset[col].fillna(dataset[col].mean())
        return dataset

    # Impute the median value in case of missing data.
    def impute_median(self, dataset, col):
        dataset[col] = dataset[col].fillna(dataset[col].median())
        return dataset

    # Interpolate the dataset based on previous/next values.
    def impute_interpolate(self, dataset, col):
        dataset[col] = dataset[col].interpolate()
        # And fill the initial data points if needed: a backward fill propagates the
        # first observed non-null value backwards until another non-null value is met.
        dataset[col] = dataset[col].fillna(method='bfill')
        return dataset
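
A short sketch of how the interpolation-based imputer might be called; the heart-rate column and its values are invented for illustration.

import numpy as np
import pandas as pd

from ImputationMissingValues import ImputationMissingValues

# Hypothetical heart-rate series with gaps, including missing values at the start.
df = pd.DataFrame({'hr': [np.nan, np.nan, 62.0, np.nan, 65.0, 64.0, np.nan]})

imputer = ImputationMissingValues()
df = imputer.impute_interpolate(df, 'hr')
# The interior gap is interpolated linearly and the leading NaNs are back-filled
# with the first observed value (62.0).
print(df['hr'].tolist())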

OutlierDetection.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"

import math

import numpy as np
import pandas as pd
import scipy.special
from sklearn.mixture import GaussianMixture


# Class for outlier detection algorithms based on some distribution of the data.
# They all consider only single points per row (i.e. one column).
class DistributionBasedOutlierDetection:

    # Finds outliers in the specified column of the data table and adds a binary column
    # with the same name extended with '_outlier' that expresses the result per data point.
    def chauvenet(self, data_table, col):
        # Taken partly from: https://www.astro.rug.nl/software/kapteyn/

        # Compute the mean and standard deviation.
        mean = data_table[col].mean()
        std = data_table[col].std()
        N = len(data_table.index)
        criterion = 1.0 / (2 * N)

        # Consider the deviation for the data points.
        deviation = abs(data_table[col] - mean) / std

        # Express the upper and lower bounds.
        low = -deviation / math.sqrt(2)
        high = deviation / math.sqrt(2)
        prob = []
        mask = []

        # Pass over all rows in the dataset.
        for i in range(0, len(data_table.index)):
            # Determine the probability of observing the point (positional indexing so
            # this also works for non-integer, e.g. datetime, indexes).
            prob.append(
                1.0 - 0.5 * (scipy.special.erf(high.iloc[i]) - scipy.special.erf(low.iloc[i])))
            # And mark it as an outlier when the probability is below our criterion.
            mask.append(prob[i] < criterion)
        data_table[col + '_outlier'] = mask
        return data_table

    # Fits a mixture model towards the data expressed in col and adds a column with the
    # probability of observing the value given the mixture model.
    def mixture_model(self, data_table, col, n):
        print('Applying mixture models')
        # Fit a mixture model to our data.
        data = data_table[data_table[col].notnull()][col]
        g = GaussianMixture(n_components=n, max_iter=100, n_init=1)
        reshaped_data = np.array(data.values.reshape(-1, 1))
        g.fit(reshaped_data)

        # Predict the probabilities. score_samples returns the natural logarithm of the
        # density, so exponentiate to get back to a density value.
        probs = g.score_samples(reshaped_data)

        # Create the right data frame and concatenate the two.
        data_probs = pd.DataFrame(
            np.exp(probs), index=data.index, columns=[col + '_mixture'])

        data_table = pd.concat([data_table, data_probs], axis=1)

        return data_table
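
A hedged usage sketch for both detectors; the temperature column, the injected spike and the choice of three mixture components are illustrative assumptions, not part of the commit.

import numpy as np
import pandas as pd

from OutlierDetection import DistributionBasedOutlierDetection

# Hypothetical data: mostly Gaussian noise plus one injected spike.
rng = np.random.default_rng(1)
df = pd.DataFrame({'temp': rng.normal(loc=20.0, scale=0.5, size=500)})
df.loc[100, 'temp'] = 35.0

detector = DistributionBasedOutlierDetection()
df = detector.chauvenet(df, 'temp')         # adds a boolean 'temp_outlier' column
df = detector.mixture_model(df, 'temp', 3)  # adds a 'temp_mixture' density column
print(df['temp_outlier'].sum())             # the spike should be among the flagged rows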
