-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Milestone before writing a paper for SEIM-2018
Code for feature extraction and anomaly detection with LOF and One-class SVM is present, as well as a few helper scripts.
- Loading branch information
Showing
48 changed files
with
2,108 additions
and
136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
def read_data(csv_path):
    """Load row labels and numeric features from a comma-separated file.

    The file is parsed twice: once to extract the first column as labels
    (with the dtype inferred by NumPy) and once to extract every remaining
    column as floating-point features.

    Comment handling is disabled in *both* passes.  Previously only the
    feature pass disabled it, so a ``#`` inside a label truncated that label,
    and a row starting with ``#`` was dropped from the labels but not from
    the features, leaving the two results misaligned.

    :param csv_path: path of the CSV file to read.
    :return: tuple ``(labels, raw_data)`` where ``labels`` holds the first
        column and ``raw_data`` holds all remaining columns as floats.
    """
    labels = np.genfromtxt(csv_path, delimiter=',', usecols=[0], dtype=None, comments=None)
    # The first column is not numeric, so it is sliced off after parsing.
    raw_data = np.genfromtxt(csv_path, delimiter=',', comments=None)[:, 1:]
    return labels, raw_data
|
||
|
||
def print_plots(inliers, outliers, img_out_path, is_for_methods):
    """Save a grid of 2D scatter plots comparing inliers and outliers.

    Every subplot plots one pair of metrics against each other: inliers are
    drawn in blue and outliers in red, on a 2x2 grid.

    :param inliers: 2D array of inlier rows; metric columns are looked up by
        the indices in the ``columns`` mapping below.
    :param outliers: 2D array of outlier rows with the same column layout.
    :param img_out_path: path the resulting figure is saved to.
    :param is_for_methods: if true, use the method-level metric layout and
        the 'Methods' figure; otherwise use the file-level layout and the
        'Files' figure.
    """
    if is_for_methods:
        figure_name = 'Methods'
        columns = {'SLoC': 1, 'AST nodes': 2, 'AST height': 3, 'Loop nesting depth': 4, 'Cyclomatic complexity': 5}
        axis_pairs = [
            ('SLoC', 'AST nodes'),
            ('AST nodes', 'AST height'),
            ('SLoC', 'Cyclomatic complexity'),
            ('Loop nesting depth', 'Cyclomatic complexity'),
        ]
    else:
        figure_name = 'Files'
        columns = {'LoC': 1, 'SLoC': 2, 'AST nodes': 3, 'AST height': 4}
        axis_pairs = [
            ('LoC', 'SLoC'),
            ('SLoC', 'AST nodes'),
            ('SLoC', 'AST height'),
        ]

    plt.figure(num=figure_name, figsize=(12, 8))

    # Subplot positions are 1-based and filled in the order the pairs are listed.
    for position, (x_label, y_label) in enumerate(axis_pairs, start=1):
        plt.subplot(2, 2, position)
        x_index = columns[x_label]
        y_index = columns[y_label]
        plt.scatter(inliers[:, x_index], inliers[:, y_index], c='blue', edgecolor='k')
        plt.scatter(outliers[:, x_index], outliers[:, y_index], c='red', edgecolor='k')
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    plt.tight_layout()
    plt.savefig(img_out_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
import csv | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import os | ||
import pandas | ||
import time | ||
|
||
# noinspection PyUnresolvedReferences | ||
from mpl_toolkits.mplot3d import Axes3D | ||
from sklearn.decomposition import PCA | ||
from sklearn.model_selection import ParameterGrid | ||
from sklearn.neighbors import LocalOutlierFactor | ||
from sklearn.preprocessing import scale | ||
from sklearn.svm import OneClassSVM | ||
|
||
# Name of the dataset; it selects the input file under ../data/.
dataset_name = "top1k"
# When true, a 3D scatter plot is saved for every evaluated parameter set.
is_drawing = False

# All outputs (per-configuration CSVs, plots and the log) share the
# "<out_dir>methods" prefix.
out_dir = "../out-data/"
csv_in_path = f"../data/{dataset_name}_methods.csv"
out_path = f"{out_dir}methods"
log_path = f"{out_dir}methods.log"

# exist_ok=True creates the directory only when it is missing, without the
# window between a separate existence check and the creation.
os.makedirs(out_dir, exist_ok=True)
# The log file stays open for the whole run and is closed at the end of the script.
log_file = open(log_path, mode='w+')
|
||
|
||
def log(s):
    """Echo the string ``s`` to stdout and write it, followed by a newline, to the log file."""
    print(s)
    log_file.write(s + '\n')
|
||
|
||
start_time = time.time()

# Load input.
# `error_bad_lines=True` used to be passed here.  It was the default
# behaviour (fail on malformed rows) and the keyword has been removed from
# recent pandas releases, so it is omitted to keep the script running on
# both old and new versions.
methods = pandas.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE, engine='python')

# Fix potential problems in input: drop every row that has a NaN in any of
# the feature columns (all columns except the first one).
X = np.array(methods.values[:, 1:], dtype="float64")
ok_lines = ~np.isnan(X).any(axis=1)
methods = methods[ok_lines]
X = X[ok_lines]
n_methods = methods.shape[0]

# Preprocessing
X = scale(X)
# X = PCA(n_components=3).fit_transform(X)

# All configs: each entry names a classifier, gives the estimator instance
# and the parameter grid that is expanded with ParameterGrid below.
all_clf_configs = [
    {
        'clf_name': 'lof',
        'clf': LocalOutlierFactor(n_jobs=-1),
        'param_grid': {
            'n_neighbors': [10, 5, 2],
            'algorithm': ['ball_tree', 'kd_tree'],
            'contamination': [0.00005, 0.0001]
        }
    },
    {
        'clf_name': 'svm',
        'clf': OneClassSVM(shrinking=True),
        'param_grid': [
            {
                'kernel': ['linear'],
                'nu': [0.00005]
            },
            {
                'kernel': ['rbf', 'poly'],
                'nu': [0.00005, 0.0001],
                'gamma': [0.1]
            }
        ]
    }
]
# Configs for the current run
clf_configs = [clf_config for clf_config in all_clf_configs if clf_config['clf_name'] in ('lof', 'svm')]

for clf_config in clf_configs:
    clf_name = clf_config['clf_name']
    clf = clf_config['clf']
    param_sets = list(ParameterGrid(clf_config['param_grid']))
    log(clf_name)

    # For calculating 'intersection', i.e. methods marked as anomalous
    # by the current classifier with all param sets
    all_indices = np.arange(0, n_methods)
    intersect_outlier_indices = all_indices

    for params in param_sets:
        param_set_desc = str(params)
        log(f"\t{param_set_desc}")

        # Fit the model and mark data
        clf.set_params(**params)
        if clf_name == 'lof':
            marks = clf.fit_predict(X)
        elif clf_name == 'svm':
            clf.fit(X)
            # Suppressed warning below: clf is in dictionary
            # noinspection PyUnresolvedReferences
            marks = clf.predict(X)
        else:
            log(f"Error: unknown classifier name {clf_name}!")
            # SystemExit is what exit(1) raises, without relying on the
            # interactive helper injected by the `site` module.
            raise SystemExit(1)

        # Boolean masks over the rows of X: positive marks are inliers,
        # negative marks are outliers.
        marks = np.asarray(marks)
        inlier_indices = marks > 0
        outlier_indices = marks < 0
        n_inliers = int(np.count_nonzero(inlier_indices))
        n_outliers = int(np.count_nonzero(outlier_indices))
        log(f"\t\tInliers:\t{n_inliers:6}/{n_methods:6}\t{(n_inliers * 100 / n_methods):10.7}%")
        log(f"\t\tOutliers:\t{n_outliers:6}/{n_methods:6}\t{(n_outliers * 100 / n_methods):10.7}%")

        if n_outliers > n_inliers:
            # Swap the masks themselves (not only the derived X arrays) so
            # that the saved CSV and the intersection below use the same,
            # swapped notion of "outlier" as the plot.
            inlier_indices, outlier_indices = outlier_indices, inlier_indices
            log("\t\tSwapped 'inliers' and 'outliers', because there were more outliers than inliers!")

        intersect_outlier_indices = np.intersect1d(intersect_outlier_indices, all_indices[outlier_indices])

        X_inliers = X[inlier_indices]
        X_outliers = X[outlier_indices]

        if is_drawing:
            # Show the first three feature columns on a 3D plot (these are
            # the principal components when the PCA step above is enabled).
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(X_inliers[:, 1], X_inliers[:, 0], X_inliers[:, 2], c='None', edgecolor='blue', marker='o')
            ax.scatter(X_outliers[:, 1], X_outliers[:, 0], X_outliers[:, 2], c='red', marker='^')
            plt.savefig(f"{out_path} {clf_name} {param_set_desc}.png")
            # One figure is created per parameter set; close it so figures
            # do not pile up in memory over the whole grid.
            plt.close(fig)

        # Save output of this configuration to file (full input rows of the outliers)
        outlier_names = methods.values[outlier_indices]
        dataframe = pandas.DataFrame(outlier_names)
        dataframe.to_csv(f"{out_path} {clf_name} {param_set_desc}.csv")

    # Save the 'intersection' to file (first input column only)
    intersect_outlier_names = methods.values[:, 0][intersect_outlier_indices]
    dataframe = pandas.DataFrame(intersect_outlier_names)
    dataframe.to_csv(f"{out_path} {clf_name} intersection.csv")

end_time = time.time()
log(f"Total elapsed time: {end_time - start_time}")
log_file.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
from mpl_toolkits.mplot3d import Axes3D | ||
|
||
# Inputs: the per-method metrics and the marks file that is stacked onto
# them row by row.
csv_path = '../data/6proj_methodMetrics.csv'
csv_marks_path = '../data/6proj_methods_marked_svm.csv'

# The first CSV column is dropped; the remaining columns are parsed as floats.
raw_data = np.genfromtxt(csv_path, delimiter=',', comments=None)[:, 1:]
marks = np.genfromtxt(csv_marks_path)
marked = np.column_stack((raw_data, marks))

# The mark sits in the last column of `marked`: after truncation to int,
# positive values select inliers and negative values select outliers.
_mark_signs = np.asarray([int(value) for value in marked[:, -1]])
inlier_indices = _mark_signs > 0
outlier_indices = _mark_signs < 0
inliers = raw_data[inlier_indices]
outliers = raw_data[outlier_indices]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Axes show data columns 0, 1 and 4, labelled as below.
ax.set_xlabel('SLoC')
ax.set_ylabel('AST nodes')
ax.set_zlabel('Cyclomatic complexity')

ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 4], c='red', marker='^')
# ax.scatter(inliers[:, 0], inliers[:, 1], inliers[:, 4], c='blue', marker='o')

plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import itertools | ||
import os | ||
import numpy as np | ||
import pandas | ||
|
||
# Directory holding the per-configuration outlier CSV files to compare.
data_path = "../out-data/out_top1k_no_pca"
# File names (relative to data_path) whose outlier sets are intersected.
# Commented-out entries are other configurations that can be re-enabled;
# the names must match the files on disk exactly.
compared_files = [
    # "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 2}.csv",
    # "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 5}.csv",
    "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 10}.csv",
    # "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 2}.csv",
    # "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 5}.csv",
    # "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 10}.csv",
]
# Text file the resulting report (URL and function name per entry) is written to.
report_path = "../out-data/report_2.txt"
|
||
|
||
def signature_to_url_and_fname(signature):
    """Turn a method signature into a GitHub file URL and a function name.

    The signature is taken apart as
    ``<6-char prefix><account>__<repo>/<path>.kt:<function>``; each separator
    is split on its first occurrence only.

    :param signature: signature string in the layout described above.
    :return: NumPy array of two strings: the URL of the ``.kt`` file on the
        ``master`` branch and the function name.
    :raises IndexError: if any of the expected separators is missing.
    """
    # The first six characters are the 'repos/' prefix and are dropped.
    pieces = signature[6:].split('__', 1)
    account, remainder = pieces[0], pieces[1]

    pieces = remainder.split('.kt:', 1)
    repo_and_filename, fun_name = pieces[0], pieces[1]

    pieces = repo_and_filename.split('/', 1)
    repo, filepath = pieces[0], pieces[1]

    url = f"https://github.com/{account}/{repo}/blob/master/{filepath}.kt"
    return np.array([url, fun_name])
|
||
|
||
# One set per compared file, built from column 1 of the CSV as read by
# pandas.
sets = []

for filename in compared_files:
    data = pandas.read_csv(os.path.join(data_path, filename))
    data = np.array(data)
    sets.append(set(data[:, 1]))

# Entries present in every compared file.
res = set.intersection(*sets)
# Iterate in sorted order: the iteration order of a set of strings changes
# from run to run, which made the report differ between identical runs.
out_info = np.array([signature_to_url_and_fname(entry) for entry in sorted(res)])
# Each entry is written as the URL and the function name on separate lines,
# followed by a blank line.
np.savetxt(report_path, out_info, fmt='%s', delimiter='\n', newline='\n\n')

# m = [list(i) for i in itertools.combinations(compared_files, 2)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.