
Commit a8f980b

Add files via upload
1 parent ea4ba0f commit a8f980b

9 files changed, +3150 -0 lines changed

DataTransformation.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"

from sklearn.decomposition import PCA, FastICA

import util


class PrincipalComponentAnalysis:

    def __init__(self):
        self.pca = []

    # Perform the PCA on the selected columns and return the explained variance.
    def determine_pc_explained_variance(self, data_table, cols):
        # Normalize the data first.
        dt_norm = util.normalize_dataset(data_table, cols)

        # Perform the PCA.
        self.pca = PCA(n_components=len(cols))
        self.pca.fit(dt_norm[cols])

        # And return the explained variances.
        return self.pca.explained_variance_ratio_

    # Apply a PCA given the number of components we have selected.
    # We add new pca columns.
    def apply_pca(self, data_table, cols, number_comp):
        # Normalize the data first.
        dt_norm = util.normalize_dataset(data_table, cols)

        # Perform the PCA.
        self.pca = PCA(n_components=number_comp)
        self.pca.fit(dt_norm[cols])

        # Transform our old values.
        new_values = self.pca.transform(dt_norm[cols])

        # And add the new ones:
        for comp in range(0, number_comp):
            data_table['pca_' + str(comp + 1)] = new_values[:, comp]

        return data_table


class IndependentComponentAnalysis:
    # Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html

    def __init__(self):
        self.ica = []

    # Apply a FastICA given the number of components we have selected.
    # We add new FastICA columns.
    def apply_fast_ica(self, data_table, cols, number_comp):
        # Normalize the data first.
        dt_norm = util.normalize_dataset(data_table, cols)

        # Perform the FastICA for all components.
        self.ica = FastICA(n_components=number_comp)
        self.ica.fit(dt_norm[cols])

        # Transform our old values.
        new_values = self.ica.transform(dt_norm[cols])

        # And add the new ones:
        for comp in range(0, number_comp):
            data_table['FastICA_' + str(comp + 1)] = new_values[:, comp]

        return data_table
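
For reference, a minimal usage sketch of the two classes above. The DataFrame, the column names 'acc_x'/'acc_y'/'acc_z' and the random data are invented for illustration, and the import assumes the util module providing normalize_dataset (which these files expect) is on the path.

import numpy as np
import pandas as pd

from DataTransformation import PrincipalComponentAnalysis, IndependentComponentAnalysis

# Hypothetical toy data: three partially correlated sensor channels.
rng = np.random.default_rng(0)
df = pd.DataFrame({'acc_x': rng.normal(size=200)})
df['acc_y'] = 0.8 * df['acc_x'] + rng.normal(scale=0.2, size=200)
df['acc_z'] = rng.normal(size=200)
cols = ['acc_x', 'acc_y', 'acc_z']

pca = PrincipalComponentAnalysis()
# Explained variance ratio per principal component, e.g. to pick a cut-off.
print(pca.determine_pc_explained_variance(df, cols))

# Keep two components; this appends 'pca_1' and 'pca_2' columns to df.
df = pca.apply_pca(df, cols, 2)

ica = IndependentComponentAnalysis()
# Appends 'FastICA_1' and 'FastICA_2' columns to df.
df = ica.apply_fast_ica(df, cols, 2)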

FrequencyAbstraction.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"

import numpy as np
import pandas as pd


# This class performs a Fourier transformation on the data to find frequencies that
# occur often and to filter noise.
class FourierTransformation:

    def __init__(self):
        self.temp_list = []
        self.freqs = None

    # Find the amplitudes of the different frequencies using a fast Fourier transformation.
    # Here, the sampling rate expresses the number of samples per second
    # (i.e. the frequency of the dataset in Hertz).
    def find_fft_transformation(self, data):
        # Create the transformation; this includes the amplitudes of both the real
        # and imaginary parts.
        transformation = np.fft.rfft(data, len(data))

        # Real amplitudes.
        real_ampl = transformation.real
        # Frequency with the highest real amplitude.
        max_freq = self.freqs[np.argmax(real_ampl[0:len(real_ampl)])]
        # Amplitude-weighted average frequency.
        freq_weighted = float(np.sum(self.freqs * real_ampl)) / np.sum(real_ampl)

        # Power spectral entropy (pse).
        PSD = np.divide(np.square(real_ampl), float(len(real_ampl)))
        PSD_pdf = np.divide(PSD, np.sum(PSD))

        # Make sure there are no zeros before taking the logarithm.
        if np.count_nonzero(PSD_pdf) == PSD_pdf.size:
            pse = -np.sum(np.log(PSD_pdf) * PSD_pdf)
        else:
            pse = 0

        # Prepend the summary values in the same order as the column names built in
        # abstract_frequency: max_freq, freq_weighted, pse, then the amplitudes.
        row = np.insert(real_ampl, 0, pse)
        row = np.insert(row, 0, freq_weighted)
        row = np.insert(row, 0, max_freq)

        self.temp_list.append(row)

        return 0

    # Get frequencies over a certain window.
    def abstract_frequency(self, data_table, columns, window_size, sampling_rate):
        self.freqs = (sampling_rate *
                      np.fft.rfftfreq(int(window_size))).round(3)

        for col in columns:
            # Prepare the column names.
            collist = [col + '_max_freq', col + '_freq_weighted', col + '_pse']
            collist = collist + [col + '_freq_' + str(freq) + '_Hz_ws_' +
                                 str(window_size) for freq in self.freqs]

            # Rolling statistics to calculate frequencies, per window size.
            # The pandas rolling method can only return one aggregation value, so the
            # values are not returned but stored in the temporary class variable 'temp_list'.
            data_table[col].rolling(
                window_size + 1).apply(self.find_fft_transformation)

            # Pad the incomplete leading windows with NaNs.
            frequencies = np.pad(np.array(self.temp_list), ((window_size, 0), (0, 0)),
                                 'constant', constant_values=np.nan)

            # Add the new frequency columns to the frame.
            data_table[collist] = pd.DataFrame(
                frequencies, index=data_table.index)

            # Reset the temporary storage.
            del self.temp_list[:]

        return data_table
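
A small, hypothetical driver for the class above; the 5 Hz sine, the sampling rate and the window size are invented for illustration only.

import numpy as np
import pandas as pd

from FrequencyAbstraction import FourierTransformation

# Hypothetical signal: a 5 Hz sine sampled at 100 Hz for 10 seconds.
fs = 100
t = np.arange(0, 10, 1.0 / fs)
df = pd.DataFrame({'acc_x': np.sin(2 * np.pi * 5 * t)})

ft = FourierTransformation()
# A 40-sample window adds 'acc_x_max_freq', 'acc_x_freq_weighted', 'acc_x_pse'
# and one 'acc_x_freq_<f>_Hz_ws_40' column per frequency bin; the first 40 rows
# stay NaN because their window is incomplete.
df = ft.abstract_frequency(df, ['acc_x'], window_size=40, sampling_rate=fs)
print(df['acc_x_max_freq'].dropna().head())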

ImputationMissingValues.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"


class ImputationMissingValues:

    # Impute the mean value in case of missing data.
    def impute_mean(self, dataset, col):
        dataset[col] = dataset[col].fillna(dataset[col].mean())
        return dataset

    # Impute the median value in case of missing data.
    def impute_median(self, dataset, col):
        dataset[col] = dataset[col].fillna(dataset[col].median())
        return dataset

    # Interpolate the dataset based on previous/next values.
    def impute_interpolate(self, dataset, col):
        dataset[col] = dataset[col].interpolate()
        # And fill the initial data points if needed: a backward fill propagates the
        # first observed non-null value backwards until another non-null value is met.
        dataset[col] = dataset[col].fillna(method='bfill')
        return dataset
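
A short sketch of how the interpolation-based imputer might be called; the heart-rate column and its values are invented for illustration.

import numpy as np
import pandas as pd

from ImputationMissingValues import ImputationMissingValues

# Hypothetical heart-rate series with gaps, including missing values at the start.
df = pd.DataFrame({'hr': [np.nan, np.nan, 62.0, np.nan, 65.0, 64.0, np.nan]})

imputer = ImputationMissingValues()
df = imputer.impute_interpolate(df, 'hr')
# The interior gap is interpolated linearly and the leading NaNs are back-filled
# with the first observed value (62.0).
print(df['hr'].tolist())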

OutlierDetection.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Based on reading the book "Machine Learning for the Quantified Self: On the Art of Learning from Sensory Data"

import math

import numpy as np
import pandas as pd
import scipy.special
from sklearn.mixture import GaussianMixture


# Class for outlier detection algorithms based on some distribution of the data.
# They all consider only single points per row (i.e. one column).
class DistributionBasedOutlierDetection:

    # Finds outliers in the specified column of the data table and adds a binary column
    # with the same name extended with '_outlier' that expresses the result per data point.
    def chauvenet(self, data_table, col):
        # Taken partly from: https://www.astro.rug.nl/software/kapteyn/

        # Compute the mean and standard deviation.
        mean = data_table[col].mean()
        std = data_table[col].std()
        N = len(data_table.index)
        criterion = 1.0 / (2 * N)

        # Consider the deviation for the data points.
        deviation = abs(data_table[col] - mean) / std

        # Express the upper and lower bounds.
        low = -deviation / math.sqrt(2)
        high = deviation / math.sqrt(2)
        prob = []
        mask = []

        # Pass over all rows in the dataset.
        for i in range(0, len(data_table.index)):
            # Determine the probability of observing the point (positional indexing so
            # this also works for non-integer, e.g. datetime, indexes).
            prob.append(
                1.0 - 0.5 * (scipy.special.erf(high.iloc[i]) - scipy.special.erf(low.iloc[i])))
            # And mark it as an outlier when the probability is below our criterion.
            mask.append(prob[i] < criterion)
        data_table[col + '_outlier'] = mask
        return data_table

    # Fits a mixture model towards the data expressed in col and adds a column with the
    # probability of observing the value given the mixture model.
    def mixture_model(self, data_table, col, n):
        print('Applying mixture models')
        # Fit a mixture model to our data.
        data = data_table[data_table[col].notnull()][col]
        g = GaussianMixture(n_components=n, max_iter=100, n_init=1)
        reshaped_data = np.array(data.values.reshape(-1, 1))
        g.fit(reshaped_data)

        # Predict the probabilities. score_samples returns the natural logarithm of the
        # density, so exponentiate to get back to a density value.
        probs = g.score_samples(reshaped_data)

        # Create the right data frame and concatenate the two.
        data_probs = pd.DataFrame(
            np.exp(probs), index=data.index, columns=[col + '_mixture'])

        data_table = pd.concat([data_table, data_probs], axis=1)

        return data_table
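
A hedged usage sketch for both detectors; the temperature column, the injected spike and the choice of three mixture components are illustrative assumptions, not part of the commit.

import numpy as np
import pandas as pd

from OutlierDetection import DistributionBasedOutlierDetection

# Hypothetical data: mostly Gaussian noise plus one injected spike.
rng = np.random.default_rng(1)
df = pd.DataFrame({'temp': rng.normal(loc=20.0, scale=0.5, size=500)})
df.loc[100, 'temp'] = 35.0

detector = DistributionBasedOutlierDetection()
df = detector.chauvenet(df, 'temp')         # adds a boolean 'temp_outlier' column
df = detector.mixture_model(df, 'temp', 3)  # adds a 'temp_mixture' density column
print(df['temp_outlier'].sum())             # the spike should be among the flagged rows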
