Skip to content

Commit 167e752

Browse files
authored
all code written so far
All code written so far to wind turbine noise clustering
1 parent 6f7a2ef commit 167e752

15 files changed

+1556
-0
lines changed

LibrosaExtractFeatures.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Thu May 3 15:15:24 2018
5+
6+
@author: thileepan
7+
8+
A function for Feature extraction using Librosa
9+
"""
10+
11+
import librosa
12+
import numpy as np
13+
import pandas as pd
14+
from datetime import timedelta
15+
16+
def ExtractFeatures(t):
    """Extract frame-level audio features from one 10-minute recording.

    Parameters
    ----------
    t : dict-like with keys
        'data' : 1-D numpy array of audio samples,
        'FS'   : sample rate in Hz,
        't'    : datetime marking the start of the recording.

    Returns
    -------
    pandas.DataFrame
        One row per analysis frame (60 rows, timestamps 10 s apart) and one
        column per feature: energy, 40 mel bands, 13 MFCCs, spectral
        centroid/bandwidth/contrast/rolloff and zero-crossing rate.
    """
    HOP = 256000  # hop between frames, in samples — presumably 10 s of audio; TODO confirm FS

    # ----EXTRACTING FEATURES------------
    # The first and last frame are discarded ([:, 1:-1]); presumably because
    # librosa's edge padding makes them only partially filled — confirm.
    # NOTE(review): librosa renamed `rmse` to `rms` in 0.7.0; keep `rmse` for
    # the librosa version this project pins, and update on upgrade.
    energy = librosa.feature.rmse(y=t['data'], frame_length=HOP, hop_length=HOP)[:, 1:-1]
    mel_spectrum = librosa.feature.melspectrogram(y=t['data'], sr=t['FS'], n_mels=40, hop_length=HOP)[:, 1:-1]
    mfcc = librosa.feature.mfcc(y=t['data'], sr=t['FS'], n_mfcc=13, hop_length=HOP)[:, 1:-1]
    spec_centr = librosa.feature.spectral_centroid(y=t['data'], sr=t['FS'], hop_length=HOP)[:, 1:-1]
    spec_bandwidth = librosa.feature.spectral_bandwidth(y=t['data'], sr=t['FS'], hop_length=HOP)[:, 1:-1]
    spec_contrast = librosa.feature.spectral_contrast(y=t['data'], sr=t['FS'], hop_length=HOP)[:, 1:-1]
    spec_rolloff = librosa.feature.spectral_rolloff(y=t['data'], sr=t['FS'], hop_length=HOP, roll_percent=0.90)[:, 1:-1]
    #tonal_centroid = librosa.feature.tonnetz(y=t['data'], sr=t['FS'])
    zcr = librosa.feature.zero_crossing_rate(y=t['data'], frame_length=25600, hop_length=HOP)[:, 1:-1]

    # -----TIMESTAMP CREATION------
    # Frames are labelled starting 5 s into the recording, spaced 10 s apart.
    start_timestamp = t['t'] + timedelta(seconds=5)
    timestamp = pd.date_range(start=start_timestamp, freq='10S', periods=60)

    # ----COLUMN NAMES CREATION-----------
    mel_spectrum_col_names = ['melspectrum_{}'.format(i) for i in range(mel_spectrum.shape[0])]
    mfcc_feature_col_names = ['mfcc_{}'.format(i) for i in range(mfcc.shape[0])]
    spec_contrast_col_names = ['Spectral_Contrast_0_200', 'Spectral_Contrast_200_400',
                               'Spectral_Contrast_400_800', 'Spectral_Contrast_800_1600',
                               'Spectral_Contrast_1600_3200', 'Spectral_Contrast_3200_6400',
                               'Spectral_Contrast_6400_12800']
    # BUG FIX: the original wrapped this concatenation in an extra pair of
    # brackets, producing a one-element list-of-lists; pandas then built
    # MultiIndex columns instead of the intended flat column index.
    column_names = (['Energy'] + mel_spectrum_col_names + mfcc_feature_col_names +
                    ['Spectral_Centroid', 'Spectral_Bandwidth'] +
                    spec_contrast_col_names +
                    ['Spectral_Rolloff', 'Zero_Crossing_Rate'])

    # ---CREATING A NUMPY ARRAY OF FEATURES------
    numpy_array_of_features = np.vstack((energy, mel_spectrum, mfcc, spec_centr,
                                         spec_bandwidth, spec_contrast,
                                         spec_rolloff, zcr))

    # ----CREATING A PANDAS DATAFRAME OF FEATURES----
    ten_minutes_features = pd.DataFrame(numpy_array_of_features.T,
                                        index=timestamp, columns=column_names)

    return ten_minutes_features

PCA_and_Outlier_threshold_finder.py

+180
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Thu Oct 25 12:06:29 2018
5+
6+
PCA and OUTLIER removal functions: Version 1.0
7+
8+
The TrainPCA function will apply PCA and identify the right threshold for removing
9+
outliers
10+
11+
The RemoveOutliers function uses any monthly/yearly/representative dataset,
12+
the final PCA object trained from the TrainPCA function, and also the threshold
13+
values from the TrainPCA function to transform the data to PCA domain and apply
14+
threshold values and remove outliers.
15+
16+
The clustering function will cluster one month of data and return the labels as
17+
a numpy array.
18+
19+
The dimension reduced (using PCA) array is converted into a dataframe to which
20+
the labels are attached as the last column and written into a hdf5 file in the
21+
physical hard disk of the computer.
22+
23+
24+
@author: thileepan
25+
"""
26+
27+
import pandas as pd
28+
from sklearn.decomposition import PCA
29+
from soundapi import SoundAPI
30+
import numpy as np
31+
from plotCluster import plotClusters
32+
from hdbscan import HDBSCAN
33+
import matplotlib.pyplot as plt
34+
import os
35+
import glob
36+
from natsort import natsorted
37+
import hdbscan
38+
39+
# Glob patterns for the monthly Librosa feature files: April-December 2016
# followed by January-May 2017, combined in chronological order.
pattern_list_2016 = [f'Librosa_2016_{month}_*' for month in range(4, 13)]
pattern_list_2017 = [f'Librosa_2017_{month}_*' for month in range(1, 6)]
pattern_list = [*pattern_list_2016, *pattern_list_2017]
43+
def TrainPCA(training_data):
    """Fit a 2-component PCA on the training features in two passes.

    Pass 1 fits a PCA and removes gross outliers with empirically chosen
    cuts (PC1 < 3000, PC2 < 8730). Pass 2 refits on the cleaned rows; the
    returned thresholds were hand-tuned from the histograms plotted below.

    Parameters
    ----------
    training_data : pandas DataFrame of feature rows (supports .iloc).

    Returns
    -------
    (pca_final, threshold1, threshold2) : the refitted PCA object and the
    upper cuts on the first and second principal components.
    """
    pca1 = PCA(n_components=2)
    pca1.fit(training_data)
    t1 = pca1.transform(training_data)
    #plt.scatter(t1[:,0], t1[:,1])
    #plt.figure()
    #plt.hist(t1[:,0], bins=1000) #initial obs t1[:,0] < 3000
    #plt.figure()
    #plt.hist(t1[:,1], bins=1000) #initial obs -4400 < t1[:,1] <8730
    # BUG FIX: matplotlib.mlab.find was deprecated in 2.2 and removed in
    # 3.1 (and was never exposed as plt.mlab in recent versions);
    # np.flatnonzero is the documented replacement.
    mask1 = np.flatnonzero((t1[:, 0] < 3000) & (t1[:, 1] < 8730))
    second_training_set = training_data.iloc[mask1]

    pca_final = PCA(n_components=2)
    pca_final.fit(second_training_set)
    t2 = pca_final.transform(second_training_set)
    # Diagnostic plots used to pick the thresholds below by inspection.
    plt.scatter(t2[:, 0], t2[:, 1], s=0.01)
    plt.figure()
    plt.hist(t2[:, 0], bins=1000)
    plt.figure()
    plt.hist(t2[:, 1], bins=1000)
    #mask2 = plt.mlab.find((t2[:,0]<8700) & (t2[:,1]<4000))
    threshold1 = 8700  # upper cut on PC1, read off the histogram above
    threshold2 = 4000  # upper cut on PC2
    return (pca_final, threshold1, threshold2)
68+
def ReadMonthlyData(pattern):
    """Read and concatenate every HDF5 feature file matching a glob pattern.

    Files are ordered with natural sort (so 'file_2' precedes 'file_10'),
    keeping the rows in chronological order.

    Parameters
    ----------
    pattern : str, glob pattern relative to the current working directory.

    Returns
    -------
    pandas DataFrame with all matching files stacked row-wise; empty
    DataFrame when nothing matches (same as the original behavior).
    """
    file_list = natsorted(glob.glob(pattern))
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0,
    # and appending inside a loop copies quadratically; read all frames
    # first and concatenate once.
    frames = [pd.read_hdf(file) for file in file_list]
    return pd.concat(frames) if frames else pd.DataFrame()
78+
79+
def RemoveOutliers(data, pca, th1, th2):
    """Project `data` with a fitted PCA and drop rows beyond the thresholds.

    Parameters
    ----------
    data : array-like / DataFrame accepted by ``pca.transform``.
    pca : fitted object with a ``transform`` method (e.g. sklearn PCA).
    th1, th2 : upper bounds on the first and second transformed components.

    Returns
    -------
    (IL, mask) : the transformed rows satisfying both bounds, and the
    integer positions of those rows in `data`.
    """
    transformed_data = pca.transform(data)
    # BUG FIX: matplotlib.mlab.find was removed (matplotlib 3.1); the
    # original also wrapped the boolean condition in a one-element list,
    # which only worked because mlab.find ravelled its input.
    keep = (transformed_data[:, 0] < th1) & (transformed_data[:, 1] < th2)
    mask = np.flatnonzero(keep)
    IL = transformed_data[mask]
    return (IL, mask)
85+
86+
def Clustering(IL, mask, clustering_training_data):
    """Cluster the outlier-free rows with HDBSCAN and attach the labels.

    Parameters
    ----------
    IL : pandas DataFrame of PCA-transformed, outlier-free rows.
    mask : row labels under which the cluster labels are stored.
    clustering_training_data : unused — kept for interface compatibility.
    """
    #ri_test = np.random.choice(range(len(IL)),size=np.int(IL.shape[0]/10))
    clusterer = HDBSCAN(min_cluster_size=1250, gen_min_span_tree=True)
    #hdb = clusterer.fit(clustering_training_data)
    # BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and later removed;
    # .loc is the label-based replacement.
    # NOTE(review): RemoveOutliers returns a numpy array, which has neither
    # .ix nor .loc — this only works if the caller passes a DataFrame; confirm.
    IL.loc[mask, 'hdbscan_cluster'] = clusterer.fit_predict(IL)
91+
92+
# Function to train cluster object and return the object
def TrainCluster(x):
    """Fit HDBSCAN on training data `x` and return the fitted clusterer.

    prediction_data=True asks hdbscan to keep the extra data needed later
    by hdbscan.approximate_predict() (used in ClusterOneMonthData).
    """
    clusterer = HDBSCAN(min_cluster_size=1250, gen_min_span_tree=True, prediction_data=True) # creating a clustering object
    hdb = clusterer.fit(x) #Fitting cluster object on training data
    # NOTE(review): the bare attribute access below is a no-op expression;
    # presumably intended to force generation of the prediction data —
    # confirm against the pinned hdbscan version before removing.
    hdb.prediction_data
    # Drop the local reference to the (large) training array early.
    del(x)
    return hdb
99+
100+
def ClusterOneMonthData(month, name):
    """Cluster one month of feature data and write the results to disk.

    Relies on module-level globals: `pattern_list`, plus `pca_final`,
    `th1`, `th2` (from TrainPCA) and `hdb` (from TrainCluster).

    Parameters
    ----------
    month : int, index into `pattern_list`.
    name : str, prefix for the two output files written below.
    """
    ## Month should be an integer (0 = Apr_2016, 8= dec_2016, 9 = jan_2017, 13 = May_2017)
    ## Name should be a string, with which we should save the dataframe
    # NOTE(review): hard-coded Windows path; breaks on any other machine.
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/features/') #Windows
    One_Month_Data = ReadMonthlyData(pattern_list[month]) #Reading one month data using the pattern list
    IL_One_Month_Data, monthly_data_mask = RemoveOutliers(One_Month_Data, pca_final, th1, th2) #Removing outliers from
    ##one month data
    labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data) # predicting labels and strengths
    ## for one month

    ##Writing the two PCs, labels and strengths into a dataframe and storing it as hdf5 file
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/Results/')
    pca_dataframe = pd.DataFrame(IL_One_Month_Data)
    pca_dataframe['labels'] = labels
    pca_dataframe['strengths'] = strengths
    pca_dataframe.columns = ['pca1', 'pca2', 'labels', 'strengths']
    pca_dataframe.to_hdf(path_or_buf= '{}_clusters'.format(name), key='pca_dataframe')
    # Also persist the outlier-free raw feature rows for this month.
    One_Month_Data.iloc[monthly_data_mask].to_csv(path_or_buf = '{}_feature_data'.format(name), header = True, index= True)
    # Release the large per-month objects before the next call.
    del(IL_One_Month_Data, One_Month_Data, labels, monthly_data_mask, pca_dataframe, strengths)
    #print('Length of one month data is {} \n'.format(len(One_Month_Data)), 'Length of Cluster data {} \n'.format(len(IL_One_Month_Data)),
    #      'Length of One Month Feature data is {} \n'. format(len(One_Month_Data.iloc[monthly_data_mask])))
121+
122+
123+
124+
##Main program

#Reading Training Data

# NOTE(review): the two chdir calls below are alternative dev paths; the
# second immediately overrides (or raises on) the first — comment out
# whichever does not apply to the current machine.
os.chdir('/home/thileepan/Projects/Clustering/features') #Linux
os.chdir('C:/Users/tpaulraj/Projects/Clustering/features/') #Windows

training_data = pd.read_csv('training_data.csv', index_col=0) # Reading training data
# Fit the two-pass PCA and obtain the hand-tuned outlier thresholds.
pca_final, th1, th2 = TrainPCA(training_data)
IL_training_data, training_data_mask = RemoveOutliers(training_data, pca_final, th1, th2)
hdb = TrainCluster(IL_training_data) #obtaining trained cluster object
# Free the large training arrays once the fitted models are retained.
del(training_data, training_data_mask, IL_training_data)


# NOTE(review): `month` and `name` are not defined anywhere in this file —
# this line raises NameError unless both are bound interactively first.
ClusterOneMonthData(month, name)
139+
140+
141+
##-------------------------EXCLUDED PART---------------------------------------------------------------------
142+
#labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data[0:len(IL_One_Month_Data)/4, :])
143+
#labels, strengths = hdbscan.approximate_predict(hdb, IL_One_Month_Data)
144+
##CPU times: user 56 s, sys: 1.38 s, total: 57.4 s
145+
## Wall time: 57.5 s
146+
#One_Month_Data.ix[mask, 'clusters'] = labels
147+
#One_Month_Data.isnull().sum()
148+
#One_Month_Data = One_Month_Data.dropna(inplce=True)
149+
##------------------------------------------------------------------------------------------------------------
150+
151+
152+
##Program to combine features and clustered data into a single dataframe

os.chdir('/media/thileepan/USB DISK/Results')

# One filename per month, e.g. "April_2016_feature_data" / "April_2016_clusters".
features_list = pd.date_range('2016-04', '2017-06', freq='M').strftime("%B_%Y_feature_data")
clusters_list = pd.date_range('2016-04', '2017-06', freq='M').strftime("%B_%Y_clusters")

# BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0, and
# appending in a loop copies quadratically; collect the monthly samples and
# concatenate once at the end.
sampled_months = []
for i in range(len(features_list)):
    features_dataframe = pd.read_csv(features_list[i], parse_dates=[0])
    cluster_dataframe = pd.read_hdf(clusters_list[i])
    features_clusters = pd.concat([features_dataframe, cluster_dataframe], axis=1)
    # Keep a 10 % random sample of each month to bound the combined size.
    sampled_months.append(features_clusters.sample(frac=0.10))
full_year_features = pd.concat(sampled_months)

# The CSV's unnamed first column holds the original timestamp index.
full_year_features.set_index(keys='Unnamed: 0', inplace=True)
full_year_features.index.names = ['timestamp']
169+
170+
171+
## Plotting the clustered data points
SA = SoundAPI()
# Load the sound-file index — presumably maps timestamps to raw audio so
# clicked points can be played back; confirm against the soundapi docs.
SA.load('Auris3_indexall2.csv')

data = pd.read_csv('features_and_clusters_data', index_col=0, parse_dates=True)

# Columns 65 and 66 of `data` are used as the two plot axes — presumably
# the PCA components appended after the 65 feature columns; TODO confirm.
plotClusters(data, pc=data.values, pc1= 65, pc2=66, clusterfieldname='labels', SA=SA, pointsize = 1)
178+
179+
180+

0 commit comments

Comments
 (0)