Skip to content

Commit 6f7a2ef

Browse files
committed
Initial commit
First commit
0 parents  commit 6f7a2ef

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Thu Oct 25 12:06:29 2018
5+
6+
PCA and OUTLIER removal functions
7+
8+
The TrainPCA function will apply PCA and identify the right threshold for removing
9+
outliers
10+
11+
The RemoveOutliers function uses any monthly/yearly/representative dataset,
12+
the final PCA object trained from the TrainPCA function, and also the threshold
13+
values from the TrainPCA function to transform the data to PCA domain and apply
14+
threshold values and remove outliers.
15+
16+
17+
@author: thileepan
18+
"""
19+
20+
import pandas as pd
21+
from sklearn.decomposition import PCA
22+
from soundapi import SoundAPI
23+
import numpy as np
24+
from plotCluster import plotClusters
25+
import hdbscan
26+
import matplotlib.pyplot as plt
27+
import os
28+
import glob
29+
from natsort import natsorted
30+
31+
32+
# Switch to the features directory first so that the relative file names
# used below (the training CSV and the glob patterns) resolve against it.
FEATURES_DIR = '/home/thileepan/Projects/Clustering/features'
os.chdir(FEATURES_DIR)

# Load the training table; the first CSV column becomes the index.
TRAINING_CSV = 'training_data.csv'
training_data = pd.read_csv(TRAINING_CSV, index_col=0)
35+
36+
def TrainPCA(training_data):
    """Fit a two-stage, 2-component PCA and return it with outlier thresholds.

    A first PCA is fitted on ``training_data``; only the rows whose first
    component is below 3000 and whose second component is below 8730 are
    kept. A second PCA is then fitted on the retained rows, and a scatter
    plot plus one histogram per component of that second projection are
    drawn with matplotlib (the figures are created but not shown or saved
    here).

    Args:
        training_data: pandas DataFrame of feature rows; ``.iloc`` is used
            to select the retained rows.

    Returns:
        Tuple ``(pca_final, threshold1, threshold2)``: the PCA fitted on the
        retained rows and the fixed cut-offs 8700 and 4000 for the first and
        second component respectively.
    """
    # First pass: project onto two components to locate gross outliers.
    first_pca = PCA(n_components=2)
    first_pca.fit(training_data)
    first_projection = first_pca.transform(training_data)

    # Cut-offs come from inspecting the first-pass histograms (initial
    # observations: component 0 < 3000, -4400 < component 1 < 8730). Only
    # the upper bounds are applied.
    # np.flatnonzero replaces matplotlib.mlab.find, which was deprecated in
    # Matplotlib 2.2 and later removed; it returns the same integer indices.
    keep = np.flatnonzero(
        (first_projection[:, 0] < 3000) & (first_projection[:, 1] < 8730)
    )
    second_training_set = training_data.iloc[keep]

    # Second pass: refit on the cleaned set; this is the model handed back.
    pca_final = PCA(n_components=2)
    pca_final.fit(second_training_set)
    second_projection = pca_final.transform(second_training_set)

    # Diagnostic figures used to pick the final thresholds by eye.
    plt.scatter(second_projection[:, 0], second_projection[:, 1], s=0.01)
    plt.figure()
    plt.hist(second_projection[:, 0], bins=1000)
    plt.figure()
    plt.hist(second_projection[:, 1], bins=1000)

    threshold1 = 8700
    threshold2 = 4000
    return (pca_final, threshold1, threshold2)
60+
61+
def RemoveOutliers(data, pca, th1, th2):
    """Project ``data`` with ``pca`` and keep the rows below both thresholds.

    Args:
        data: samples in a form accepted by ``pca.transform``.
        pca: fitted object exposing ``transform`` whose output has at least
            two columns (e.g. the PCA returned by TrainPCA).
        th1: exclusive upper bound on the first transformed component.
        th2: exclusive upper bound on the second transformed component.

    Returns:
        Tuple ``(IL, mask)``: ``IL`` holds the transformed rows that pass
        both thresholds and ``mask`` the integer row positions of those
        rows within the transformed array.
    """
    transformed_data = pca.transform(data)
    inliers = (transformed_data[:, 0] < th1) & (transformed_data[:, 1] < th2)
    # np.flatnonzero replaces matplotlib.mlab.find, which was deprecated in
    # Matplotlib 2.2 and later removed; it returns the same integer indices.
    mask = np.flatnonzero(inliers)
    IL = transformed_data[mask]
    return (IL, mask)
67+
68+
# Glob patterns for the Librosa feature files, one per numbered period
# (presumably month numbers): 4 through 12 for 2016 and 1 through 5 for 2017.
pattern_list_2016 = list(map('Librosa_2016_{}_*'.format, range(4, 13)))
pattern_list_2017 = list(map('Librosa_2017_{}_*'.format, range(1, 6)))
70+
71+
def ReadMonthlyData(pattern):
    """Load every HDF file matching ``pattern`` into one DataFrame.

    Matching files are read in natural sort order (via ``natsorted``) and
    stacked row-wise, keeping each file's own index.

    Args:
        pattern: glob pattern, resolved against the current working
            directory when relative.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no file
        matches the pattern.
    """
    matches = glob.glob(pattern)
    if not matches:
        # pd.concat raises on an empty sequence; keep the original
        # "no files -> empty frame" result.
        return pd.DataFrame()

    # A single concat replaces the repeated DataFrame.append calls:
    # append was removed in pandas 2.0 and re-copied the data every pass.
    frames = [pd.read_hdf(path) for path in natsorted(matches)]
    return pd.concat(frames)
81+

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Clustering

0 commit comments

Comments
 (0)