|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +""" |
| 4 | +Created on Thu Oct 25 12:06:29 2018 |
| 5 | +
|
| 6 | +PCA and OUTLIER removal functions |
| 7 | +
|
| 8 | +The TrainPCA function will apply PCA and identify the right threshold for removing |
| 9 | +outliers |
| 10 | +
|
| 11 | +The RemoveOutliers function uses any monthly/yearly/representative dataset, |
| 12 | +the final PCA object trained from the TrainPCA function, and also the threshold |
| 13 | +values from the TrainPCA function to transform the data to PCA domain and apply |
| 14 | +threshold values and remove outliers. |
| 15 | +
|
| 16 | +
|
| 17 | +@author: thileepan |
| 18 | +""" |
| 19 | + |
| 20 | +import pandas as pd |
| 21 | +from sklearn.decomposition import PCA |
| 22 | +from soundapi import SoundAPI |
| 23 | +import numpy as np |
| 24 | +from plotCluster import plotClusters |
| 25 | +import hdbscan |
| 26 | +import matplotlib.pyplot as plt |
| 27 | +import os |
| 28 | +import glob |
| 29 | +from natsort import natsorted |
| 30 | + |
| 31 | + |
# NOTE(review): import-time side effect. The process working directory is
# switched to a machine-specific absolute path, so the relative file name
# read below is resolved against that directory.
os.chdir('/home/thileepan/Projects/Clustering/features')

# Load the training table at import time; the first CSV column becomes the
# DataFrame index.
training_data = pd.read_csv('training_data.csv', index_col=0)
| 35 | + |
def TrainPCA(training_data, *, initial_th1=3000, initial_th2=8730,
             threshold1=8700, threshold2=4000):
    """Fit a two-pass, 2-component PCA and return it with outlier thresholds.

    Pass 1 fits a PCA on ``training_data`` and keeps only the rows whose
    first-component score is strictly below ``initial_th1`` and whose
    second-component score is strictly below ``initial_th2``.  Pass 2 fits a
    fresh PCA on the surviving rows; its scores are drawn as a scatter plot
    followed by one histogram per component (each histogram in a new figure).

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training rows; must support ``.iloc`` positional indexing.
    initial_th1, initial_th2 : number, keyword-only
        Upper limits applied to the pass-1 scores.  The defaults are the
        values that were previously hard-coded here.
    threshold1, threshold2 : number, keyword-only
        Thresholds handed back to the caller unchanged, for use with
        ``RemoveOutliers``.  The defaults are the previously hard-coded values.

    Returns
    -------
    tuple
        ``(pca_final, threshold1, threshold2)`` where ``pca_final`` is the
        PCA object fitted in pass 2.
    """
    # Pass 1: coarse PCA, used only to discard the most extreme rows.
    pca1 = PCA(n_components=2)
    pca1.fit(training_data)
    t1 = pca1.transform(training_data)

    # np.flatnonzero replaces plt.mlab.find, which was removed from
    # matplotlib; both yield the integer positions where the mask is True.
    # NOTE(review): an earlier inline note mentioned a lower bound of -4400
    # on the second component, but only upper bounds were ever applied.
    mask1 = np.flatnonzero((t1[:, 0] < initial_th1) & (t1[:, 1] < initial_th2))
    second_training_set = training_data.iloc[mask1]

    # Pass 2: the PCA that is returned, fitted on the filtered rows.
    pca_final = PCA(n_components=2)
    pca_final.fit(second_training_set)
    t2 = pca_final.transform(second_training_set)

    # Diagnostic plots of the pass-2 scores.
    plt.scatter(t2[:, 0], t2[:, 1], s=0.01)
    plt.figure()
    plt.hist(t2[:, 0], bins=1000)
    plt.figure()
    plt.hist(t2[:, 1], bins=1000)

    return (pca_final, threshold1, threshold2)
| 60 | + |
def RemoveOutliers(data, pca, th1, th2):
    """Project ``data`` with ``pca`` and keep the rows below both thresholds.

    Parameters
    ----------
    data : array-like
        Passed as-is to ``pca.transform``.
    pca : object
        Anything with a ``transform(data)`` method returning a 2-D NumPy
        array with at least two columns (e.g. the object from ``TrainPCA``).
    th1, th2 : number
        Strict upper limits for the first and second transformed columns.

    Returns
    -------
    tuple
        ``(IL, mask)``: ``IL`` holds the transformed rows that satisfy both
        limits, ``mask`` is the 1-D integer array of their row positions.
    """
    transformed_data = pca.transform(data)
    # np.flatnonzero replaces plt.mlab.find, which was removed from
    # matplotlib; it returns the positions where the boolean mask is True.
    mask = np.flatnonzero(
        (transformed_data[:, 0] < th1) & (transformed_data[:, 1] < th2)
    )
    IL = transformed_data[mask]
    return (IL, mask)
| 67 | + |
# Glob patterns for the feature files, one per number in each range
# (presumably the month number: 4-12 for 2016 and 1-5 for 2017).
pattern_list_2016 = list(map('Librosa_2016_{}_*'.format, range(4, 13)))
pattern_list_2017 = list(map('Librosa_2017_{}_*'.format, range(1, 6)))
| 70 | + |
def ReadMonthlyData(pattern):
    """Read every HDF file matching ``pattern`` into one DataFrame.

    Matching paths are put in natural sort order, read with
    ``pd.read_hdf`` and stacked row-wise in that order.

    Parameters
    ----------
    pattern : str
        Glob pattern, resolved relative to the current working directory
        unless it is absolute.

    Returns
    -------
    pandas.DataFrame
        The concatenated contents, or an empty DataFrame when no file
        matches the pattern.
    """
    file_list = natsorted(glob.glob(pattern))
    frames = [pd.read_hdf(path) for path in file_list]
    if not frames:
        # pd.concat raises on an empty list; keep the original behaviour of
        # returning an empty DataFrame when nothing matches.
        return pd.DataFrame()
    # A single concat replaces the per-file DataFrame.append calls:
    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated data on every iteration.
    return pd.concat(frames)
| 81 | + |
0 commit comments