|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +""" |
| 4 | +Created on Thu Oct 25 12:06:29 2018 |
| 5 | +
|
| 6 | +PCA and OUTLIER removal functions: Version 1.0 |
| 7 | +
|
| 8 | +The TrainPCA function will apply PCA and identify the right threshold for removing |
| 9 | +outliers |
| 10 | +
|
| 11 | +The RemoveOutliers function uses any monthly/yearly/representative dataset,
| 12 | +the final PCA object trained from the TrainPCA function, and also the threshold |
| 13 | +values from the TrainPCA function to transform the data to PCA domain and apply |
| 14 | +threshold values and remove outliers. |
| 15 | +
|
| 16 | +The clustering function will cluster one month of data and return the labels as |
| 17 | +a numpy array. |
| 18 | +
|
| 19 | +The dimension reduced (using PCA) array is converted into a dataframe to which |
| 20 | +the labels are attached as the last column and written into a hdf5 file in the |
| 21 | +physical hard disk of the computer. |
| 22 | +
|
| 23 | +
|
| 24 | +@author: thileepan |
| 25 | +""" |
| 26 | + |
| 27 | +import pandas as pd |
| 28 | +from sklearn.decomposition import PCA |
| 29 | +from soundapi import SoundAPI |
| 30 | +import numpy as np |
| 31 | +from plotCluster import plotClusters |
| 32 | +from hdbscan import HDBSCAN |
| 33 | +import matplotlib.pyplot as plt |
| 34 | +import os |
| 35 | +import glob |
| 36 | +from natsort import natsorted |
| 37 | +import hdbscan |
| 38 | + |
# Glob patterns, one per month of recorded data: April-December 2016
# followed by January-May 2017 (14 months in total).
pattern_list_2016 = [f'Librosa_2016_{month}_*' for month in range(4, 13)]
pattern_list_2017 = [f'Librosa_2017_{month}_*' for month in range(1, 6)]
pattern_list = pattern_list_2016 + pattern_list_2017
| 42 | + |
def TrainPCA(training_data):
    """Fit a two-stage 2-component PCA and pick outlier thresholds.

    A first PCA is fitted on the full data; rows whose first two principal
    components exceed empirically chosen limits (from earlier histogram
    inspection) are dropped, and a final PCA is fitted on the cleaned subset.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Feature matrix, one observation per row.

    Returns
    -------
    tuple
        (fitted sklearn PCA object, threshold for PC1, threshold for PC2).
    """
    pca1 = PCA(n_components=2)
    t1 = pca1.fit_transform(training_data)
    # First-pass outlier cut. Limits were chosen by eye from histograms of
    # the two components. np.flatnonzero replaces plt.mlab.find, which was
    # removed from matplotlib (mlab.find is gone since 3.1).
    mask1 = np.flatnonzero((t1[:, 0] < 3000) & (t1[:, 1] < 8730))
    second_training_set = training_data.iloc[mask1]

    pca_final = PCA(n_components=2)
    t2 = pca_final.fit_transform(second_training_set)
    # Diagnostic plots used to eyeball the final thresholds below.
    plt.scatter(t2[:, 0], t2[:, 1], s=0.01)
    plt.figure()
    plt.hist(t2[:, 0], bins=1000)
    plt.figure()
    plt.hist(t2[:, 1], bins=1000)
    # Final thresholds chosen from the histograms above.
    threshold1 = 8700
    threshold2 = 4000
    return (pca_final, threshold1, threshold2)
| 67 | + |
def ReadMonthlyData(pattern):
    """Read and row-concatenate all HDF5 feature files matching *pattern*.

    Files are sorted naturally (file_2 before file_10) so the rows come out
    in chronological order.

    Parameters
    ----------
    pattern : str
        Glob pattern identifying one month of feature files.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked row-wise; empty if nothing matches.
    """
    file_list = natsorted(glob.glob(pattern))
    # DataFrame.append was removed in pandas 2.0 and copied the accumulator
    # on every iteration; read everything, then concatenate once.
    frames = [pd.read_hdf(file) for file in file_list]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
| 78 | + |
def RemoveOutliers(data, pca, th1, th2):
    """Project *data* with *pca* and drop rows beyond the thresholds.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature matrix to clean.
    pca : fitted PCA object
        Projects *data* into 2-D principal-component space via .transform().
    th1, th2 : float
        Upper limits for the first and second principal component.

    Returns
    -------
    tuple
        (inlier rows of the transformed data, integer indices of the rows
        that were kept).
    """
    transformed_data = pca.transform(data)
    # np.flatnonzero replaces the removed plt.mlab.find; the old code also
    # wrapped the condition in a list, which only gave flat indices because
    # mlab.find ravelled its argument.
    keep = (transformed_data[:, 0] < th1) & (transformed_data[:, 1] < th2)
    mask = np.flatnonzero(keep)
    IL = transformed_data[mask]
    return (IL, mask)
| 85 | + |
def Clustering(IL, mask, clustering_training_data):
    """Cluster the inlier points *IL* with HDBSCAN and return the labels.

    Parameters
    ----------
    IL : ndarray
        Inlier rows in PCA space (output of RemoveOutliers).
    mask : ndarray
        Integer indices of the inliers; unused here, kept for interface
        compatibility with existing callers.
    clustering_training_data : unused
        Retained for backward compatibility.

    Returns
    -------
    ndarray
        HDBSCAN cluster label for every row of *IL* (-1 = noise).
    """
    clusterer = HDBSCAN(min_cluster_size=1250, gen_min_span_tree=True)
    # The old code assigned labels via IL.ix[...], which always failed:
    # IL is a NumPy array (no .ix), and DataFrame.ix was removed from
    # pandas anyway. Return the labels and let the caller attach them.
    return clusterer.fit_predict(IL)
| 91 | + |
# Function to train cluster object and return the object
def TrainCluster(x):
    """Fit an HDBSCAN clusterer on *x* and return the fitted object.

    prediction_data=True makes the fitted object usable later with
    hdbscan.approximate_predict() on unseen data.

    Parameters
    ----------
    x : array-like
        Training feature matrix, one observation per row.

    Returns
    -------
    hdbscan.HDBSCAN
        The fitted clusterer.
    """
    clusterer = HDBSCAN(min_cluster_size=1250, gen_min_span_tree=True,
                        prediction_data=True)  # creating a clustering object
    # The original body had a no-op `hdb.prediction_data` attribute access
    # and a pointless `del(x)` (only unbinds the local); both removed.
    return clusterer.fit(x)  # fitting cluster object on training data
| 99 | + |
def ClusterOneMonthData(month, name):
    """Cluster one month of feature data and persist the results to disk.

    Reads the month's feature files from the features directory, removes
    outliers with the module-level ``pca_final``/``th1``/``th2`` (trained
    in the main program below), predicts cluster labels with the
    module-level ``hdb`` clusterer, and writes two files into the Results
    directory:

    * ``<name>_clusters``     -- HDF5: columns pca1, pca2, labels, strengths
    * ``<name>_feature_data`` -- CSV of the inlier raw feature rows

    NOTE(review): relies on the hard-coded Windows paths below and on the
    module-level globals pca_final, th1, th2, hdb and pattern_list being
    defined before this is called -- confirm on the target machine.

    Parameters
    ----------
    month : int
        Index into ``pattern_list`` (0 = Apr 2016 ... 13 = May 2017).
    name : str
        Basename used for the two output files.
    """
    ## Month should be an integer (0 = Apr_2016, 8= dec_2016, 9 = jan_2017, 13 = May_2017)
    ## Name should be a string, with which we should save the dataframe
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/features/') #Windows
    One_Month_Data = ReadMonthlyData(pattern_list[month]) #Reading one month data using the pattern list
    IL_One_Month_Data, monthly_data_mask = RemoveOutliers(One_Month_Data, pca_final, th1, th2) #Removing outliers from
    ##one month data
    labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data) # predicting labels and strengths
    ## for one month

    ##Writing the two PCs, labels and strengths into a dataframe and storing it as hdf5 file
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/Results/')
    pca_dataframe = pd.DataFrame(IL_One_Month_Data)
    pca_dataframe['labels'] = labels
    pca_dataframe['strengths'] = strengths
    pca_dataframe.columns = ['pca1', 'pca2', 'labels', 'strengths']
    pca_dataframe.to_hdf(path_or_buf= '{}_clusters'.format(name), key='pca_dataframe')
    One_Month_Data.iloc[monthly_data_mask].to_csv(path_or_buf = '{}_feature_data'.format(name), header = True, index= True)
    # Free the month's data before the next call to keep peak memory down.
    del(IL_One_Month_Data, One_Month_Data, labels, monthly_data_mask, pca_dataframe, strengths)
    #print('Length of one month data is {} \n'.format(len(One_Month_Data)), 'Length of Cluster data {} \n'.format(len(IL_One_Month_Data)),
    #      'Length of One Month Feature data is {} \n'. format(len(One_Month_Data.iloc[monthly_data_mask])))
| 121 | + |
| 122 | + |
| 123 | + |
##Main program

# Reading Training Data

# Change into whichever features directory exists on this machine.
# The original code ran both chdir calls back to back, so one of them
# always raised on the other operating system.
for _features_dir in ('/home/thileepan/Projects/Clustering/features',          # Linux
                      'C:/Users/tpaulraj/Projects/Clustering/features/'):      # Windows
    if os.path.isdir(_features_dir):
        os.chdir(_features_dir)
        break

training_data = pd.read_csv('training_data.csv', index_col=0)  # Reading training data
pca_final, th1, th2 = TrainPCA(training_data)
IL_training_data, training_data_mask = RemoveOutliers(training_data, pca_final, th1, th2)
hdb = TrainCluster(IL_training_data)  # obtaining trained cluster object
# Release the (large) training arrays before the monthly passes.
del(training_data, training_data_mask, IL_training_data)

# Cluster every month in turn. The names match the "<Month>_<Year>_..."
# files read by the combination step further down. The original code
# called ClusterOneMonthData(month, name) with undefined globals, which
# raised NameError.
month_names = pd.date_range('2016-04', '2017-06', freq='M').strftime('%B_%Y')
for month, name in enumerate(month_names):
    ClusterOneMonthData(month, name)
| 139 | + |
| 140 | + |
| 141 | +##-------------------------EXCLUDED PART--------------------------------------------------------------------- |
| 142 | + #labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data[0:len(IL_One_Month_Data)/4, :]) |
| 143 | + #labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data) |
| 144 | + ##CPU times: user 56 s, sys: 1.38 s, total: 57.4 s |
| 145 | + ## Wall time: 57.5 s |
| 146 | + #One_Month_Data.ix[mask, 'clusters'] = labels |
| 147 | + #One_Month_Data.isnull().sum() |
| 148 | +    #One_Month_Data = One_Month_Data.dropna(inplace=True)
| 149 | +##------------------------------------------------------------------------------------------------------------ |
| 150 | + |
| 151 | + |
##Program to combine features and clustered data into a single dataframe

os.chdir('/media/thileepan/USB DISK/Results')

# One file name per month from April 2016 to May 2017, matching the files
# written by ClusterOneMonthData.
features_list = pd.date_range('2016-04', '2017-06', freq='M').strftime("%B_%Y_feature_data")
clusters_list = pd.date_range('2016-04', '2017-06', freq='M').strftime("%B_%Y_clusters")

# Pair each month's raw features with its cluster labels column-wise and
# keep a 10% random sample of every month. Collect the pieces and
# concatenate once at the end: DataFrame.append was removed in pandas 2.0
# and re-copied the accumulator on every iteration anyway.
sampled_months = []
for features_file, clusters_file in zip(features_list, clusters_list):
    features_dataframe = pd.read_csv(features_file, parse_dates=[0])
    cluster_dataframe = pd.read_hdf(clusters_file)
    features_clusters = pd.concat([features_dataframe, cluster_dataframe], axis=1)
    sampled_months.append(features_clusters.sample(frac=0.10))
full_year_features = pd.concat(sampled_months)

# The CSV's unnamed first column holds the timestamps; promote it to the index.
full_year_features.set_index(keys='Unnamed: 0', inplace=True)
full_year_features.index.names = ['timestamp']
| 169 | + |
| 170 | + |
## Plotting the clustered data points
# Load the sound index used by plotClusters (presumably for looking up /
# playing back the sound snippet behind each point -- verify in plotCluster).
SA = SoundAPI()
SA.load('Auris3_indexall2.csv')

data = pd.read_csv('features_and_clusters_data', index_col=0, parse_dates=True)

# pc1/pc2 are positional column indices into data.values; 65 and 66 are
# assumed to be the two principal-component columns of the combined
# dataframe -- TODO confirm against the file's actual layout.
plotClusters(data, pc=data.values, pc1= 65, pc2=66, clusterfieldname='labels', SA=SA, pointsize = 1)
| 178 | + |
| 179 | + |
| 180 | + |
0 commit comments