Milestone before writing a paper for SEIM-2018

Code for feature extraction and anomaly detection with LOF and One-class SVM is present, as well as a few helper scripts.
JetBrains-Research · Jan 24, 2018 · f46016f · f46016f
1 parent af7941b
commit f46016f
Show file tree

Hide file tree

Showing 48 changed files with 2,108 additions and 136 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,11 +2,17 @@
 **/out/
 
 data/
+out-data/
+plots/
+
 repos/
+repo-fetching/repos.json
 
 **/.gradle/
 .idea/
 *.iml
 
+**/__pycache__/
+
 *~
 **/.DS_Store
diff --git a/analyzer/common_io.py b/analyzer/common_io.py
@@ -0,0 +1,38 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def read_data(csv_path):
+    """Load a comma-separated metrics file as a pair of labels and numeric data.
+
+    :param csv_path: path of the CSV file to read
+    :return: tuple ``(labels, raw_data)``; ``labels`` is the first column of every row (element type is
+        picked by numpy because of ``dtype=None``), ``raw_data`` is the remaining columns parsed with
+        numpy's default float dtype
+    """
+    # Both passes must tokenize the lines identically. With the default comments='#', the label pass
+    # would skip or truncate any row containing '#', leaving labels misaligned with raw_data.
+    labels = np.genfromtxt(csv_path, delimiter=',', usecols=[0], dtype=None, comments=None)
+    raw_data = np.genfromtxt(csv_path, delimiter=',', comments=None)[:, 1:]
+    return labels, raw_data
+
+
+def print_plots(inliers, outliers, img_out_path, is_for_methods):
+    """Save a 2x2 grid of scatter plots that compares inliers (blue) with outliers (red).
+
+    :param inliers: 2D array of inlier rows, addressed by the column indices listed in ``columns``
+    :param outliers: 2D array of outlier rows with the same column layout
+    :param img_out_path: destination passed to ``plt.savefig``
+    :param is_for_methods: if truthy, use the method-level metric layout and the 'Methods' figure;
+        otherwise use the file-level layout and the 'Files' figure
+    """
+    # NOTE(review): metric indices start at 1, so column 0 of both arrays is never plotted;
+    # presumably it holds something other than a metric - confirm against the callers.
+    if is_for_methods:
+        figure_name = 'Methods'
+        columns = {'SLoC': 1, 'AST nodes': 2, 'AST height': 3, 'Loop nesting depth': 4, 'Cyclomatic complexity': 5}
+        axis_pairs = [
+            ('SLoC', 'AST nodes'),
+            ('AST nodes', 'AST height'),
+            ('SLoC', 'Cyclomatic complexity'),
+            ('Loop nesting depth', 'Cyclomatic complexity'),
+        ]
+    else:
+        figure_name = 'Files'
+        columns = {'LoC': 1, 'SLoC': 2, 'AST nodes': 3, 'AST height': 4}
+        axis_pairs = [
+            ('LoC', 'SLoC'),
+            ('SLoC', 'AST nodes'),
+            ('SLoC', 'AST height'),
+        ]
+
+    figure = plt.figure(num=figure_name, figsize=(12, 8))
+    # plt.figure() re-activates an existing figure with the same name instead of creating a new one,
+    # so wipe it first: otherwise a repeated call would draw on top of the previous call's points.
+    figure.clf()
+
+    for position, (x_label, y_label) in enumerate(axis_pairs, start=1):
+        x_index = columns[x_label]
+        y_index = columns[y_label]
+        plt.subplot(2, 2, position)
+        plt.scatter(inliers[:, x_index], inliers[:, y_index], c='blue', edgecolor='k')
+        plt.scatter(outliers[:, x_index], outliers[:, y_index], c='red', edgecolor='k')
+        plt.xlabel(x_label)
+        plt.ylabel(y_label)
+
+    plt.tight_layout()
+    plt.savefig(img_out_path)
diff --git a/analyzer/methods.py b/analyzer/methods.py
@@ -0,0 +1,149 @@
+import csv
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pandas
+import time
+
+# noinspection PyUnresolvedReferences
+from mpl_toolkits.mplot3d import Axes3D
+from sklearn.decomposition import PCA
+from sklearn.model_selection import ParameterGrid
+from sklearn.neighbors import LocalOutlierFactor
+from sklearn.preprocessing import scale
+from sklearn.svm import OneClassSVM
+
+# Name of the dataset; selects the input file ../data/<dataset_name>_methods.csv
+dataset_name = "top1k"
+# When True, a 3D scatter plot is saved for every classifier configuration
+is_drawing = False
+
+out_dir = "../out-data/"
+csv_in_path = f"../data/{dataset_name}_methods.csv"
+# Common prefix of every output file produced by this script
+out_path = f"{out_dir}methods"
+log_path = f"{out_dir}methods.log"
+
+# exist_ok avoids the race between checking for the directory and creating it
+os.makedirs(out_dir, exist_ok=True)
+log_file = open(log_path, mode='w+')
+
+
+def log(s):
+    """Echo `s` to stdout and write it, followed by a newline, to the log file of this run."""
+    print(s)
+    log_file.writelines((s, '\n'))
+
+
+start_time = time.time()
+
+# Load input: a tab-separated file with a header row. The first column is kept as is,
+# every remaining column is converted to float64 below.
+# `error_bad_lines=True` was dropped on purpose: raising on malformed lines is already the pandas
+# default, and the keyword was deprecated in pandas 1.3 and removed in 2.0.
+methods = pandas.read_csv(csv_in_path, header=0, delimiter='\t', quoting=csv.QUOTE_NONE, engine='python')
+
+# Fix potential problems in input: drop every row that has a NaN in any of the numeric columns
+X = np.array(methods.values[:, 1:], dtype="float64")
+ok_lines = ~np.isnan(X).any(axis=1)
+methods = methods[ok_lines]
+X = X[ok_lines]
+n_methods = methods.shape[0]
+
+# Preprocessing
+X = scale(X)
+# X = PCA(n_components=3).fit_transform(X)
+
+# All configs
+all_clf_configs = [
+    {
+        'clf_name': 'lof',
+        'clf': LocalOutlierFactor(n_jobs=-1),
+        'param_grid': {
+            'n_neighbors': [10, 5, 2],
+            'algorithm': ['ball_tree', 'kd_tree'],
+            'contamination': [0.00005, 0.0001]
+        }
+    },
+    {
+        'clf_name': 'svm',
+        'clf': OneClassSVM(shrinking=True),
+        'param_grid': [
+            {
+                'kernel': ['linear'],
+                'nu': [0.00005]
+            },
+            {
+                'kernel': ['rbf', 'poly'],
+                'nu': [0.00005, 0.0001],
+                'gamma': [0.1]
+            }
+        ]
+    }
+]
+# Configs for the current run
+clf_configs = [clf_config for clf_config in all_clf_configs if clf_config['clf_name'] in ('lof', 'svm')]
+
+for clf_config in clf_configs:
+    clf_name = clf_config['clf_name']
+    clf = clf_config['clf']
+    param_sets = list(ParameterGrid(clf_config['param_grid']))
+    log(clf_name)
+
+    # For calculating 'intersection', i.e. methods marked as anomalous
+    # by the current classifier with all param sets
+    all_indices = np.arange(0, n_methods)
+    intersect_outlier_indices = all_indices
+
+    for params in param_sets:
+        param_set_desc = str(params)
+        log(f"\t{param_set_desc}")
+
+        # Fit the model and mark data
+        clf.set_params(**params)
+        if clf_name == 'lof':
+            marks = clf.fit_predict(X)
+        elif clf_name == 'svm':
+            clf.fit(X)
+            # Suppressed warning below: clf is in dictionary
+            # noinspection PyUnresolvedReferences
+            marks = clf.predict(X)
+        else:
+            log(f"Error: unknown classifier name {clf_name}!")
+            # Close the log explicitly so that the error message is on disk before exiting
+            log_file.close()
+            raise SystemExit(1)
+
+        # Boolean masks over the rows of X: positive marks are inliers, negative marks are outliers
+        inlier_indices = marks > 0
+        outlier_indices = marks < 0
+        n_inliers = int(np.count_nonzero(inlier_indices))
+        n_outliers = int(np.count_nonzero(outlier_indices))
+        log(f"\t\tInliers:\t{n_inliers:6}/{n_methods:6}\t{(n_inliers * 100 / n_methods):10.7}%")
+        log(f"\t\tOutliers:\t{n_outliers:6}/{n_methods:6}\t{(n_outliers * 100 / n_methods):10.7}%")
+
+        if n_outliers > n_inliers:
+            # The majority was flagged, so the minority is treated as the anomalous group.
+            # Swap the masks themselves (not only the plotted arrays), so that the plot, the saved CSV
+            # and the intersection all describe the same set of rows.
+            inlier_indices, outlier_indices = outlier_indices, inlier_indices
+            log("\t\tSwapped 'inliers' and 'outliers', because there were more outliers than inliers!")
+
+        intersect_outlier_indices = np.intersect1d(intersect_outlier_indices, all_indices[outlier_indices])
+        X_inliers = X[inlier_indices]
+        X_outliers = X[outlier_indices]
+
+        if is_drawing:
+            # Show the first three columns of the scaled data on a 3D plot
+            # (these are principal components only if the PCA line above is enabled)
+            fig = plt.figure()
+            ax = fig.add_subplot(111, projection='3d')
+            ax.scatter(X_inliers[:, 1], X_inliers[:, 0], X_inliers[:, 2], c='None', edgecolor='blue', marker='o')
+            ax.scatter(X_outliers[:, 1], X_outliers[:, 0], X_outliers[:, 2], c='red', marker='^')
+            plt.savefig(f"{out_path} {clf_name} {param_set_desc}.png")
+            # One figure is created per configuration; release it so that figures do not pile up
+            plt.close(fig)
+
+        # Save output of this configuration to file (complete input rows of the outliers)
+        outlier_names = methods.values[outlier_indices]
+        dataframe = pandas.DataFrame(outlier_names)
+        dataframe.to_csv(f"{out_path} {clf_name} {param_set_desc}.csv")
+
+    # Save the 'intersection' to file (first column only)
+    intersect_outlier_names = methods.values[:, 0][intersect_outlier_indices]
+    dataframe = pandas.DataFrame(intersect_outlier_names)
+    dataframe.to_csv(f"{out_path} {clf_name} intersection.csv")
+
+end_time = time.time()
+log(f"Total elapsed time: {end_time - start_time}")
+log_file.close()
diff --git a/analyzer/plots3d.py b/analyzer/plots3d.py
@@ -0,0 +1,27 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+
+csv_path = '../data/6proj_methodMetrics.csv'
+csv_marks_path = '../data/6proj_methods_marked_svm.csv'
+
+# Metric values (the first CSV column is dropped) and the marks, joined row by row
+raw_data = np.genfromtxt(csv_path, delimiter=',', comments=None)[:, 1:]
+marks = np.genfromtxt(csv_marks_path)
+marked = np.column_stack((raw_data, marks))
+
+# The mark is the last column of every joined row; it is truncated to an integer before its sign
+# is checked: positive means inlier, negative means outlier
+mark_values = [int(mark) for mark in marked[:, -1]]
+inlier_indices = np.asarray([value > 0 for value in mark_values])
+outlier_indices = np.asarray([value < 0 for value in mark_values])
+inliers = raw_data[inlier_indices]
+outliers = raw_data[outlier_indices]
+
+fig = plt.figure()
+ax = fig.add_subplot(111, projection='3d')
+
+# Only the outliers are drawn; the inlier layer is kept below, disabled
+outlier_x, outlier_y, outlier_z = outliers[:, 0], outliers[:, 1], outliers[:, 4]
+ax.scatter(outlier_x, outlier_y, outlier_z, c='red', marker='^')
+# ax.scatter(inliers[:, 0], inliers[:, 1], inliers[:, 4], c='blue', marker='o')
+
+for set_axis_label, axis_title in (
+        (ax.set_xlabel, 'SLoC'),
+        (ax.set_ylabel, 'AST nodes'),
+        (ax.set_zlabel, 'Cyclomatic complexity'),
+):
+    set_axis_label(axis_title)
+
+plt.show()
diff --git a/analyzer/results-analysis.py b/analyzer/results-analysis.py
@@ -0,0 +1,43 @@
+import itertools
+import os
+import numpy as np
+import pandas
+
+# Directory that holds the CSV files listed in `compared_files`
+data_path = "../out-data/out_top1k_no_pca"
+# CSV files whose entries (second column) are intersected; commented-out names are not compared
+compared_files = [
+    # "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 2}.csv",
+    # "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 5}.csv",
+    "methods lof {'algorithm': 'ball_tree', 'contamination': 5e-05, 'n_neighbors': 10}.csv",
+    # "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 2}.csv",
+    # "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 5}.csv",
+    # "methods lof {'algorithm': 'kd_tree', 'contamination': 5e-05, 'n_neighbors': 10}.csv",
+]
+# Text file that receives the generated report
+report_path = "../out-data/report_2.txt"
+
+
+def signature_to_url_and_fname(signature):
+    """Convert a method signature into the GitHub URL of its file and the function name.
+
+    The signature is expected to look like ``[repos/]<account>__<repo>/<path>.kt:<function>``.
+
+    :param signature: signature string of the shape described above
+    :return: numpy array ``[url, function_name]``; the URL points at the file on the ``master`` branch
+    :raises IndexError: if the ``__``, ``.kt:`` or ``/`` separator is missing from the signature
+    """
+    prefix = 'repos/'
+    # Strip the prefix only when it is really there: blindly cutting six characters would
+    # silently eat the beginning of the account name of a signature that lacks the prefix.
+    if signature.startswith(prefix):
+        signature = signature[len(prefix):]
+
+    account_and_rest = signature.split(sep='__', maxsplit=1)
+    account, rest = account_and_rest[0], account_and_rest[1]
+
+    # Everything before the first '.kt:' is '<repo>/<path>', everything after it is the function
+    location_and_fun = rest.split(sep='.kt:', maxsplit=1)
+    repo_and_filename, fun_name = location_and_fun[0], location_and_fun[1]
+
+    repo_and_path = repo_and_filename.split(sep='/', maxsplit=1)
+    repo, filepath = repo_and_path[0], repo_and_path[1]
+
+    return np.array([f"https://github.com/{account}/{repo}/blob/master/{filepath}.kt", fun_name])
+
+
+# One set per compared file, built from the second CSV column of that file
+sets = []
+
+for filename in compared_files:
+    data = pandas.read_csv(os.path.join(data_path, filename))
+    data = np.array(data)
+    sets.append(set(data[:, 1]))
+
+# set.intersection() requires at least one operand, so an empty `compared_files` gives an empty result
+res = set.intersection(*sets) if sets else set()
+# A set iterates in arbitrary order; sort the entries to make the report reproducible between runs
+out_info = np.array([signature_to_url_and_fname(entry) for entry in sorted(res)])
+# Each entry is written as its URL and function name on separate lines, followed by an empty line
+np.savetxt(report_path, out_info, fmt='%s', delimiter='\n', newline='\n\n')
+
+# m = [list(i) for i in itertools.combinations(compared_files, 2)]
diff --git a/metrics-calc/build.gradle b/metrics-calc/build.gradle
@@ -1,5 +1,8 @@
 buildscript {
-    ext.kotlin_version = '1.1.51'
+    ext {
+        kotlin_version = '1.2.10'
+        old_kotlin_version = '1.1.0'
+    }
 
     repositories {
         jcenter()
@@ -27,7 +30,7 @@ repositories {
 }
 
 dependencies {
-    compile "org.jetbrains.kotlin:kotlin-compiler:$kotlin_version"
+    compile "org.jetbrains.kotlin:kotlin-compiler:$old_kotlin_version"
     testCompile group: 'junit', name: 'junit', version: '4.12'
 }
 
@@ -36,4 +39,4 @@ compileKotlin {
 }
 compileTestKotlin {
     kotlinOptions.jvmTarget = "1.8"
-}
+}
diff --git a/metrics-calc/src/main/kotlin/io/github/ksmirenko/kotlin/metricsCalc/KotlinFileFinder.kt b/metrics-calc/src/main/kotlin/io/github/ksmirenko/kotlin/metricsCalc/KotlinFileFinder.kt
@@ -5,6 +5,10 @@ import java.io.File
 class KotlinFileFinder(
         private val rootDirectory: String
 ) {
+    /**
+     * Files left out of the search results: the key is a file name, the value is the path suffix
+     * that a file with this name must end with in order to be skipped.
+     */
+    private val ignoredFiles: Map<String, String> = mapOf(
+            "amc.kt" to "amobconf__awesome-mobile-conferences/.github/amc.kt"
+    )
+
     private val ktFilesList = ArrayList<File>()
 
     fun search(): List<File> {
@@ -13,19 +17,27 @@ class KotlinFileFinder(
         return ktFilesList
     }
 
-    private fun search(dir: File) {
-        if (!dir.isDirectory || !dir.canRead()) {
+    /**
+     * Recursively adds every readable `.kt` file reachable from [file] to [ktFilesList],
+     * except for the files rejected by [shouldIgnore].
+     *
+     * @param file a single file, or a directory to traverse
+     */
+    private fun search(file: File) {
+        if (!file.canRead()) {
             return
         }
 
-        for (file in dir.listFiles()) {
-            if (file.isDirectory) {
-                search(file)
-            } else {
-                if (file.extension == "kt") {
-                    ktFilesList.add(file)
-                }
+        if (!file.isDirectory) {
+            // Single file
+            if (file.extension == "kt" && !shouldIgnore(file)) {
+                ktFilesList.add(file)
             }
+            return
+        }
+
+        // Traverse the directory. listFiles() returns null when an I/O error occurs
+        // (e.g. the directory disappears in the meantime), and iterating over null would throw.
+        val children = file.listFiles() ?: return
+        for (childFile in children) {
+            search(childFile)
         }
     }
+
+    /**
+     * Tells whether [file] must be skipped: [ignoredFiles] has an entry for its name
+     * and the absolute path of the file ends with the suffix stored in that entry.
+     */
+    private fun shouldIgnore(file: File): Boolean {
+        val pathSuffix = ignoredFiles[file.name] ?: return false
+        // The suffixes are written with '/', so compare against a '/'-separated path:
+        // absolutePath uses the platform separator and would never match on Windows.
+        return file.absoluteFile.invariantSeparatorsPath.endsWith(pathSuffix)
+    }
 }