-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.py
168 lines (136 loc) · 7.59 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
Main
@author Fabio Quattrini
"""
import pandas as pd
import time
from datetime import datetime
import numpy as np
from sklearn import preprocessing
import argparse
import datetime as dt
import STBOF
import STBDBCAN
def cluster():
    """Command-line driver for ST-BDBCAN clustering.

    Parses CLI arguments, loads the pairwise spatial distances and the
    dataset, computes the Spatio-Temporal Behavioral Outlier Factor
    (ST-BOF), runs ST-BDBCAN clustering, and writes the labelled dataset
    (<dataset>_results[...].csv) plus a text log of the run (.txt) next to
    the input file.
    """
    # ------------ Retrieving Parameters ------------
    parser = argparse.ArgumentParser(description='Spatio-Temporal Behavioral Density-based Clustering of Applications '
                                                 'with Noise (ST-BDBCAN) using Spatio-Temporal Behavioral Outlier '
                                                 'Factor (ST-BOF)')
    parser.add_argument('-d', '--distances', type=str, help="File with pairwise distances (.csv).", required=True)
    parser.add_argument('-f', '--file', type=str, help="Dataset (.csv).", required=True)
    parser.add_argument('-b', '--behavioral', type=str, nargs='+', help='Names of Behavioral attributes', required=True)
    parser.add_argument('-minPts', type=int, help='Number of neighboring points considered for ST-BOF', required=True)
    parser.add_argument('-k', type=int, help='Rank of neighbor used to define the k-distance', required=True)
    parser.add_argument('-pct', type=float, help='Percentage of variation accepted in ST-Behavioral Reachable Density',
                        required=True)
    parser.add_argument('-stbdbcan_minPts', type=int, help='Number of neighboring points considered for ST-BDBCAN',
                        required=True)
    parser.add_argument('-minPts_cluster', type=int, help='Minimum cluster cardinality', required=True)
    parser.add_argument('-bw', '--bweights', nargs='+', type=float, help='Behavioral attributes Weights',
                        required=False)
    # BUGFIX: '%%' instead of a bare '%' — argparse %-formats help strings,
    # so a single '%' raised ValueError whenever -h/--help was printed.
    parser.add_argument('-mnp', '--minNoisePercentage', type=float, help='Minimum %% of noise points expected',
                        default=1, required=False)
    parser.add_argument('-sw', '--sweights', type=float, help='Spatial attributes Weights', default=1, required=False)
    parser.add_argument('-tw', '--tweights', type=float, help='Temporal attribute Weights', default=1, required=False)
    parser.add_argument('-t', '--temporal', type=str, help="Name of sensor for temporal mode", required=False)
    args = parser.parse_args()

    distances_file = args.distances
    data_file = args.file
    behavioral_attributes = args.behavioral
    id_temporal = args.temporal
    min_noise_percentage = args.minNoisePercentage
    beta_s = args.sweights
    gamma_t = args.tweights
    if args.bweights is not None:
        if len(args.bweights) != len(args.behavioral):
            raise ValueError("The number of behavioral features and behavioral weights must be equal")
        alpha_b = args.bweights
    else:
        # Default: every behavioral attribute weighted equally.
        alpha_b = np.ones(len(args.behavioral))
    minPts = args.minPts
    k = args.k
    pct = args.pct
    stbdbcan_minPts = args.stbdbcan_minPts
    minPts_cluster = args.minPts_cluster

    # ------------ Load and preprocess Pairwise Distances ------------
    # Distances file format: id1, id2, dist
    spatial_distances = pd.read_csv(distances_file)
    # Substitution of the string labels with numerical IDs (shared encoder,
    # reused below so dataset ids map to the same integers).
    le = preprocessing.LabelEncoder()
    le.fit(spatial_distances['id1'])
    spatial_distances['id1'] = le.transform(spatial_distances['id1'])
    spatial_distances['id2'] = le.transform(spatial_distances['id2'])
    # ST-BOF uses a numpy array that contains in position [id1, id2] the distance of entity id1 and id2.
    # The CSV stores each pair once; adding the transpose fills the symmetric half.
    spatial_distances_np = spatial_distances.pivot(index='id1', columns='id2', values='dist').fillna(0).to_numpy()
    spatial_distances_np += spatial_distances_np.transpose()

    # ------------ Load and preprocess the Dataset ------------
    # Dataset tuples structure: [id, datetime, behavioral_attribute1, behavioral_attribute2 ...]
    data = pd.read_csv(data_file)
    # If temporal mode is selected, keep only the specified entity.
    # .copy() avoids assigning into a view of the original frame below
    # (pandas SettingWithCopy hazard).
    if id_temporal is not None:
        data = data[data['id'] == id_temporal].copy()
    data['id'] = le.transform(data['id'])
    # Scale the behavioral values to [0, 1] so attributes (e.g. flow and speed)
    # contribute comparably to the distance function.
    min_max_scaler = preprocessing.MinMaxScaler()
    data[behavioral_attributes] = min_max_scaler.fit_transform(data[behavioral_attributes])
    # Converting the time column from datetime to epoch seconds.
    data['time'] = (pd.to_datetime(data['time']) - dt.datetime(1970, 1, 1)).dt.total_seconds()

    # ------------ ST-BOF ------------
    stbof = STBOF.STBOF(data=data, min_pts=minPts, k=k, spatial_distances=spatial_distances_np,
                        alpha_b=alpha_b, beta_s=beta_s, gamma_t=gamma_t)
    # r duplicates the screen output so it can be saved to a .txt file at the end.
    r = ''
    msg = ""
    if id_temporal is not None:
        msg = "Temporal mode selected, analyzing entity {}\n".format(id_temporal)
    msg += "Using " + str(len(data)) + " points\nDistance functions features weights: Spatial: " + str(beta_s) + \
           ", Temporal: " + str(gamma_t) + ", Behavioral: " + str(alpha_b) + "\nComputing ST-Behavioral Outlier Factor\n"
    msg += "Parameters: minPts={}, k={}".format(minPts, k)
    print(msg)
    r += msg + "\n"
    start = datetime.now()
    stbof.fit_predict()
    msg = "Done. Elapsed time: {} \n------------------------------------------------".format(datetime.now() - start)
    print(msg)
    r += msg + "\n"

    # ------------ ST-BDBCAN ------------
    # Computing the ST-BOF Upper Bound, over which a point is termed as a spatio-temporal outlier.
    # The upper bound is set as the ST-BOF of the specified percentile (1-percentile point by default).
    # BUGFIX: clamp the index to >= 1 — for small datasets / percentages
    # round(...) could yield 0 and sorted[-0] == sorted[0] would pick the
    # *minimum* ST-BOF instead of a top-percentile value.
    st_bofub_index = max(1, round(len(data) * min_noise_percentage / 100))
    st_bofub = np.sort(stbof.outlier_factor_)[-st_bofub_index]
    msg = "Setting the ST-BOFUB as the ST-BOF of point {}, Minimum Noise Percentage = {}%\nCurrent ST-BOFUB = {}" \
          "\nComputing ST-BDBCAN...\nParameters: ST-BOFUB={}, pct={}, BDBCAN_minPts={}, minPts_cluster={}".format(
              st_bofub_index, min_noise_percentage, st_bofub, st_bofub, pct, stbdbcan_minPts, minPts_cluster)
    print(msg)
    r += msg + "\n"
    # Sleep used to stabilize the progress bar inside STBDBCAN.
    time.sleep(2)
    start = datetime.now()
    # BUGFIX: pass the user-supplied spatial/temporal weights instead of the
    # previously hard-coded beta_s=1, gamma_t=1, which silently ignored
    # -sw/-tw here while the ST-BOF step honored them. Backward-compatible:
    # both options default to 1.
    stbdbcan = STBDBCAN.STBDBCAN(data=data, st_bofub=st_bofub, pct=pct, stbdbcan_minPts=stbdbcan_minPts,
                                 minPts_cluster=minPts_cluster, stbof=stbof, spatial_distances=spatial_distances_np,
                                 beta_s=beta_s, gamma_t=gamma_t)
    stbdbcan.cluster()
    # Label conventions (from the counts reported below): -1 = noise, -2 = unclassified.
    noise_pts_number = len(stbdbcan.labels_[stbdbcan.labels_ == -1])
    msg = "Done. Elapsed time: {}\n------------------------------------------------\nNumber of clusters = {}\n" \
          "Unclassified: {}\nNoise:{}\n------------\nPercentage of noise points = {}".format(
              datetime.now() - start, np.max(stbdbcan.labels_), len(stbdbcan.labels_[stbdbcan.labels_ == -2]),
              noise_pts_number, noise_pts_number * 100 / len(data))
    print(msg)
    r += msg + "\n"

    # ------------ Reverting back the preprocessing and saving results ------------
    data['id'] = le.inverse_transform(data['id'])
    data[behavioral_attributes] = min_max_scaler.inverse_transform(data[behavioral_attributes])
    data["time"] = pd.to_datetime(data["time"], unit='s')
    data["clusterID"] = stbdbcan.labels_
    to_save_fileName = data_file[:-4] + "_results"
    if id_temporal is not None:
        to_save_fileName += "_{}".format(id_temporal)
    data.to_csv(to_save_fileName + ".csv", index=False)
    # BUGFIX: context manager guarantees the log file is closed even if the
    # write fails (previously the handle leaked on error).
    with open(to_save_fileName + ".txt", "w") as text_file:
        text_file.write(r)
# Script entry point: run the ST-BDBCAN clustering driver.
if __name__ == '__main__':
    cluster()