Skip to content

Commit

Permalink
Milestone before writing a paper for SEIM-2018
Browse files Browse the repository at this point in the history
Code for feature extraction and anomaly detection with LOF and One-class
SVM is present, as well as a few helper scripts.
  • Loading branch information
ksmirenko committed Jan 24, 2018
1 parent af7941b commit f46016f
Show file tree
Hide file tree
Showing 48 changed files with 2,108 additions and 136 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@
**/out/

data/
out-data/
plots/

repos/
repo-fetching/repos.json

**/.gradle/
.idea/
*.iml

**/__pycache__/

*~
**/.DS_Store
38 changes: 38 additions & 0 deletions analyzer/common_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
import matplotlib.pyplot as plt


def read_data(csv_path):
    """Load a comma-separated metrics file into row labels and a numeric matrix.

    The first column of every row is treated as the row label; the remaining
    columns are parsed as floats.

    :param csv_path: path of the CSV file to read.
    :return: tuple ``(labels, raw_data)``: ``labels`` is a 1-D array with one entry per
        row, ``raw_data`` is a 2-D float array holding every column except the first.
    """
    # Comment handling must be disabled for BOTH reads. With the default comments='#',
    # a '#' inside a label would truncate (or drop) that row in `labels` only, leaving
    # the labels misaligned with the rows of `raw_data`.
    labels = np.atleast_1d(
        np.genfromtxt(csv_path, delimiter=',', usecols=[0], dtype=None, comments=None))
    raw_data = np.genfromtxt(csv_path, delimiter=',', comments=None)
    # genfromtxt squeezes a single-row file down to 1-D; restore the row axis so the
    # column slice below always operates on a 2-D array.
    raw_data = raw_data.reshape(labels.shape[0], -1)
    return labels, raw_data[:, 1:]


def print_plots(inliers, outliers, img_out_path, is_for_methods):
    """Save a 2x2 grid of scatter plots showing inliers (blue) against outliers (red).

    :param inliers: 2-D array of inlier rows; columns are addressed by the indices in
        the `columns` mapping below.
    :param outliers: 2-D array of outlier rows with the same column layout.
    :param img_out_path: path the rendered figure is saved to.
    :param is_for_methods: when true, plot the method-level metric pairs on the figure
        named 'Methods'; otherwise plot the file-level pairs on the figure named 'Files'.
    """
    # NOTE(review): the column indices start at 1, so column 0 of the inputs is never
    # plotted -- confirm this matches the layout of the arrays callers pass in.
    if is_for_methods:
        figure_name = 'Methods'
        columns = {'SLoC': 1, 'AST nodes': 2, 'AST height': 3, 'Loop nesting depth': 4, 'Cyclomatic complexity': 5}
        axis_pairs = [
            ('SLoC', 'AST nodes'),
            ('AST nodes', 'AST height'),
            ('SLoC', 'Cyclomatic complexity'),
            ('Loop nesting depth', 'Cyclomatic complexity'),
        ]
    else:
        figure_name = 'Files'
        columns = {'LoC': 1, 'SLoC': 2, 'AST nodes': 3, 'AST height': 4}
        axis_pairs = [
            ('LoC', 'SLoC'),
            ('SLoC', 'AST nodes'),
            ('SLoC', 'AST height'),
        ]

    figure = plt.figure(num=figure_name, figsize=(12, 8))
    # pyplot hands back the already existing figure when `num` is in use, so a second
    # call would draw on top of the previous call's points. Start from a clean figure.
    figure.clf()

    for position, (x_label, y_label) in enumerate(axis_pairs, start=1):
        plt.subplot(2, 2, position)
        x_index = columns[x_label]
        y_index = columns[y_label]
        plt.scatter(inliers[:, x_index], inliers[:, y_index], c='blue', edgecolor='k')
        plt.scatter(outliers[:, x_index], outliers[:, y_index], c='red', edgecolor='k')
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    plt.tight_layout()
    plt.savefig(img_out_path)
149 changes: 149 additions & 0 deletions analyzer/methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import time

# noinspection PyUnresolvedReferences
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import scale
from sklearn.svm import OneClassSVM

# Name of the dataset; selects the input file under ../data/
dataset_name = "top1k"
# When true, a 3D scatter plot is saved for every classifier configuration
is_drawing = False

out_dir = "../out-data/"
csv_in_path = f"../data/{dataset_name}_methods.csv"
# Common prefix of all output files produced by this script
out_path = f"{out_dir}methods"
log_path = f"{out_dir}methods.log"

# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(out_dir, exist_ok=True)
# Truncates any log left over from a previous run
log_file = open(log_path, mode='w+')


def log(s):
    """Echo the string `s` to stdout and append it, newline-terminated, to the log file."""
    print(s)
    log_file.write(s + '\n')


start_time = time.time()

# Load input: a tab-separated file with a header row; quoting is disabled so that
# quote characters are kept as ordinary data. Malformed rows raise by default, which is
# what the removed `error_bad_lines=True` keyword used to request explicitly.
methods = pandas.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE, engine='python')

# Fix potential problems in input: drop every row with a NaN in any metric column.
# Column 0 is excluded from the numeric matrix (it is the column later written to the
# 'intersection' report).
X = np.array(methods.values[:, 1:], dtype="float64")
ok_lines = ~np.isnan(X).any(axis=1)
methods = methods[ok_lines]
X = X[ok_lines]
n_methods = methods.shape[0]

# Preprocessing
X = scale(X)
# X = PCA(n_components=3).fit_transform(X)

# All configs: one entry per classifier with the parameter grid to sweep
all_clf_configs = [
    {
        'clf_name': 'lof',
        'clf': LocalOutlierFactor(n_jobs=-1),
        'param_grid': {
            'n_neighbors': [10, 5, 2],
            'algorithm': ['ball_tree', 'kd_tree'],
            'contamination': [0.00005, 0.0001]
        }
    },
    {
        'clf_name': 'svm',
        'clf': OneClassSVM(shrinking=True),
        'param_grid': [
            {
                'kernel': ['linear'],
                'nu': [0.00005]
            },
            {
                'kernel': ['rbf', 'poly'],
                'nu': [0.00005, 0.0001],
                'gamma': [0.1]
            }
        ]
    }
]
# Configs for the current run
clf_configs = [clf_config for clf_config in all_clf_configs if clf_config['clf_name'] in ('lof', 'svm')]

for clf_config in clf_configs:
    clf_name = clf_config['clf_name']
    clf = clf_config['clf']
    param_sets = list(ParameterGrid(clf_config['param_grid']))
    log(clf_name)

    # For calculating 'intersection', i.e. methods marked as anomalous
    # by the current classifier with all param sets
    all_indices = np.arange(0, n_methods)
    intersect_outlier_indices = all_indices

    for params in param_sets:
        param_set_desc = str(params)
        log(f"\t{param_set_desc}")

        # Fit the model and mark data
        clf.set_params(**params)
        if clf_name == 'lof':
            marks = clf.fit_predict(X)
        elif clf_name == 'svm':
            clf.fit(X)
            # Suppressed warning below: clf is in dictionary
            # noinspection PyUnresolvedReferences
            marks = clf.predict(X)
        else:
            log(f"Error: unknown classifier name {clf_name}!")
            # `exit` is only injected by the `site` module; SystemExit is always available
            raise SystemExit(1)

        # Positive marks are inliers, negative marks are outliers
        marks = np.asarray(marks)
        inlier_indices = marks > 0
        outlier_indices = marks < 0
        intersect_outlier_indices = np.intersect1d(intersect_outlier_indices, all_indices[outlier_indices])

        X_inliers = X[inlier_indices]
        X_outliers = X[outlier_indices]
        n_inliers = X_inliers.shape[0]
        n_outliers = X_outliers.shape[0]
        log(f"\t\tInliers:\t{n_inliers:6}/{n_methods:6}\t{(n_inliers * 100 / n_methods):10.7}%")
        log(f"\t\tOutliers:\t{n_outliers:6}/{n_methods:6}\t{(n_outliers * 100 / n_methods):10.7}%")

        # NOTE(review): this swap only affects the arrays used for plotting below.
        # `outlier_indices` (saved to CSV) and the running intersection still use the
        # unswapped marks -- confirm whether they should be swapped as well.
        if n_outliers > n_inliers:
            X_inliers, X_outliers = X_outliers, X_inliers
            log("\t\tSwapped 'inliers' and 'outliers', because there were more outliers than inliers!")

        if is_drawing:
            # Show the first three columns of X on a 3D plot
            # (the principal components when the PCA step above is enabled)
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(X_inliers[:, 1], X_inliers[:, 0], X_inliers[:, 2], c='None', edgecolor='blue', marker='o')
            ax.scatter(X_outliers[:, 1], X_outliers[:, 0], X_outliers[:, 2], c='red', marker='^')
            plt.savefig(f"{out_path} {clf_name} {param_set_desc}.png")
            # One figure is created per parameter set; release it so they do not pile up
            plt.close(fig)

        # Save output of this configuration to file (all columns of the outlier rows)
        outlier_names = methods.values[:][outlier_indices]
        dataframe = pandas.DataFrame(outlier_names)
        dataframe.to_csv(f"{out_path} {clf_name} {param_set_desc}.csv")

    # Save the 'intersection' to file (first column only)
    intersect_outlier_names = methods.values[:, 0][intersect_outlier_indices]
    dataframe = pandas.DataFrame(intersect_outlier_names)
    dataframe.to_csv(f"{out_path} {clf_name} intersection.csv")

end_time = time.time()
log(f"Total elapsed time: {end_time - start_time}")
log_file.close()
27 changes: 27 additions & 0 deletions analyzer/plots3d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

csv_path = '../data/6proj_methodMetrics.csv'
csv_marks_path = '../data/6proj_methods_marked_svm.csv'

# Metric columns only (the first CSV column is dropped), plus the marks file
raw_data = np.genfromtxt(csv_path, delimiter=',', comments=None)[:, 1:]
marks = np.genfromtxt(csv_marks_path)
marked = np.column_stack((raw_data, marks))

# The last column of `marked` carries the mark: positive -> inlier, negative -> outlier
integer_marks = [int(value) for value in marked[:, -1]]
inlier_indices = np.asarray([mark > 0 for mark in integer_marks])
outlier_indices = np.asarray([mark < 0 for mark in integer_marks])
inliers = raw_data[inlier_indices]
outliers = raw_data[outlier_indices]

figure = plt.figure()
axes_3d = figure.add_subplot(111, projection='3d')

# Only the outliers are drawn; the inlier scatter is kept below for reference
axes_3d.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 4], c='red', marker='^')
# axes_3d.scatter(inliers[:, 0], inliers[:, 1], inliers[:, 4], c='blue', marker='o')

axes_3d.set_xlabel('SLoC')
axes_3d.set_ylabel('AST nodes')
axes_3d.set_zlabel('Cyclomatic complexity')

plt.show()
43 changes: 43 additions & 0 deletions analyzer/results-analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import itertools
import os
import numpy as np
import pandas

# Directory that holds the per-configuration outlier CSV files to compare
data_path = "../out-data/out_top1k_no_pca"
# File names (relative to data_path) whose outlier sets are intersected below;
# comment entries in or out to choose which configurations take part.
compared_files = [
# "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 2}.csv",
# "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 5}.csv",
"methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 10}.csv",
# "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 2}.csv",
# "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 5}.csv",
# "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 10}.csv",
]
# Path the generated report is written to
report_path = "../out-data/report_2.txt"


def signature_to_url_and_fname(signature):
    """Convert a method signature into a GitHub URL of its file and the function name.

    The expected form is ``[repos/]<account>__<repo>/<path>.kt:<function>``. The
    leading ``repos/`` directory is optional.

    :param signature: method signature string in the form described above.
    :return: numpy array of two strings: the ``blob/master`` URL of the ``.kt`` file
        and the part of the signature that follows ``.kt:``.
    :raises IndexError: if the signature lacks the ``__``, ``.kt:`` or ``/`` separator.
    """
    # Strip the directory prefix only when it is really there; unconditionally cutting
    # six characters would eat the start of the account name for unprefixed signatures.
    prefix = 'repos/'
    if signature.startswith(prefix):
        signature = signature[len(prefix):]

    # Every split is limited to the first occurrence of its separator
    account_and_rest = signature.split(sep='__', maxsplit=1)
    account = account_and_rest[0]
    path_and_function = account_and_rest[1].split(sep='.kt:', maxsplit=1)
    fun_name = path_and_function[1]
    repo_and_path = path_and_function[0].split(sep='/', maxsplit=1)
    repo = repo_and_path[0]
    filepath = repo_and_path[1]
    return np.array([f"https://github.com/{account}/{repo}/blob/master/{filepath}.kt", fun_name])


# One set of signatures per compared file. Column 0 of each CSV is the index column,
# so the values are taken from column 1.
sets = []

for filename in compared_files:
    data = pandas.read_csv(os.path.join(data_path, filename))
    data = np.array(data)
    sets.append(set(data[:, 1]))

# set.intersection(*[]) raises TypeError, so an empty file list yields an empty result
res = set.intersection(*sets) if sets else set()
# Sets have no stable iteration order; sort so that the report is reproducible
out_info = np.array([signature_to_url_and_fname(entry) for entry in sorted(res)])
np.savetxt(report_path, out_info, fmt='%s', delimiter='\n', newline='\n\n')

# m = [list(i) for i in itertools.combinations(compared_files, 2)]
9 changes: 6 additions & 3 deletions metrics-calc/build.gradle
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
buildscript {
ext.kotlin_version = '1.1.51'
ext {
kotlin_version = '1.2.10'
old_kotlin_version = '1.1.0'
}

repositories {
jcenter()
Expand Down Expand Up @@ -27,7 +30,7 @@ repositories {
}

dependencies {
compile "org.jetbrains.kotlin:kotlin-compiler:$kotlin_version"
compile "org.jetbrains.kotlin:kotlin-compiler:$old_kotlin_version"
testCompile group: 'junit', name: 'junit', version: '4.12'
}

Expand All @@ -36,4 +39,4 @@ compileKotlin {
}
compileTestKotlin {
kotlinOptions.jvmTarget = "1.8"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ import java.io.File
class KotlinFileFinder(
private val rootDirectory: String
) {
private val ignoredFiles = hashMapOf(
"amc.kt" to "amobconf__awesome-mobile-conferences/.github/amc.kt"
)

private val ktFilesList = ArrayList<File>()

fun search(): List<File> {
Expand All @@ -13,19 +17,27 @@ class KotlinFileFinder(
return ktFilesList
}

private fun search(dir: File) {
if (!dir.isDirectory || !dir.canRead()) {
private fun search(file: File) {
if (!file.canRead()) {
return
}

for (file in dir.listFiles()) {
if (file.isDirectory) {
search(file)
} else {
if (file.extension == "kt") {
ktFilesList.add(file)
}
if (!file.isDirectory) {
// Single file
if (file.extension == "kt" && !shouldIgnore(file)) {
ktFilesList.add(file)
}
return
}

// Traverse the directory
for (childFile in file.listFiles()) {
search(childFile)
}
}

/**
 * Tells whether [file] is one of the explicitly excluded files.
 *
 * A file is excluded when [ignoredFiles] has an entry for its name and the file's
 * absolute path ends with the path suffix stored in that entry.
 */
private fun shouldIgnore(file: File): Boolean {
    val pathSuffix = ignoredFiles[file.name] ?: return false
    // The suffixes in ignoredFiles are written with '/', so compare against a
    // '/'-separated path; absolutePath uses '\' on Windows and would never match.
    return file.absoluteFile.invariantSeparatorsPath.endsWith(pathSuffix)
}
}
Loading

0 comments on commit f46016f

Please sign in to comment.