This repository was archived by the owner on Jun 8, 2021. It is now read-only.

bug fix
Gianluca Serao authored and committed Jun 30, 2020
1 parent 708689b commit 174a1e9
Showing 5 changed files with 62 additions and 38 deletions.
34 changes: 19 additions & 15 deletions k-means/src/main/java/it/unipi/hadoop/KMeans.java
@@ -13,6 +13,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
@@ -100,11 +101,13 @@ public void reduce(IntWritable centroid, Iterable<Point> partialSums, Context co
 
     private static boolean stoppingCriterion(Point[] oldCentroids, Point[] newCentroids, int distance, float threshold) {
         boolean check = true;
-        for(int i = 0; i < oldCentroids.length; i++) {
-            check = oldCentroids[i].distance(newCentroids[i], distance) <= threshold;
-            if (!check) {
-                return false;
-            }
+        float oldNorm = 0.0f;
+        float newNorm = 0.0f;
+        oldNorm = Point.frobeniusNorm(oldCentroids);
+        newNorm = Point.frobeniusNorm(newCentroids);
+        check = Math.abs(newNorm - oldNorm) <= threshold;
+        if(!check) {
+            return false;
         }
         return true;
     }
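The criterion changes from per-centroid distances to one global test: the run now stops when the Frobenius norms of the old and new centroid sets differ by at most the threshold. A worked sketch with hypothetical numbers (not taken from the commit), assuming it runs inside KMeans where stoppingCriterion and the Point(String[]) constructor used by readCentroids are visible:

    // Two 2-d centroid sets; the values are illustrative only.
    Point[] oldCentroids = { new Point(new String[]{"1.0", "2.0"}), new Point(new String[]{"3.0", "4.0"}) };
    Point[] newCentroids = { new Point(new String[]{"1.1", "2.0"}), new Point(new String[]{"3.0", "4.1"}) };
    // ||old||_F = sqrt(1 + 4 + 9 + 16)       = sqrt(30)    = 5.47723 (rounded to 5 decimals by frobeniusNorm)
    // ||new||_F = sqrt(1.21 + 4 + 9 + 16.81) = sqrt(31.02) = 5.56956
    // |5.56956 - 5.47723| = 0.09233 <= 0.1, so the iteration stops.
    boolean stop = stoppingCriterion(oldCentroids, newCentroids, 2, 0.1f); // true; note the distance argument is no longer used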
@@ -152,17 +155,18 @@ private static Point[] centroidsInit(Configuration conf, String pathString, int
 
     private static Point[] readCentroids(Configuration conf, int k, String pathString) throws IOException {
         Point[] points = new Point[k];
-        FileSystem hdfs = FileSystem.get(conf);
+        FileSystem hdfs = FileSystem.get(conf);
+        FileStatus[] status = hdfs.listStatus(new Path(pathString));
 
-        for (int i = 0; i < k; i++) {
-            Path path = new Path(pathString + "/part-r-0000" + i);
-            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(path)));
-
-            String[] keyValueSplit = br.readLine().split("\t"); //Split line in K,V
-            int centroidId = Integer.parseInt(keyValueSplit[0]);
-            String[] point = keyValueSplit[1].split(",");
-            points[centroidId] = new Point(point);
-            br.close();
+        for (int i = 0; i < status.length; i++) {
+            if(!status[i].getPath().toString().endsWith("_SUCCESS")) {
+                BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(status[i].getPath())));
+                String[] keyValueSplit = br.readLine().split("\t"); //Split line in K,V
+                int centroidId = Integer.parseInt(keyValueSplit[0]);
+                String[] point = keyValueSplit[1].split(",");
+                points[centroidId] = new Point(point);
+                br.close();
+            }
         }
         hdfs.delete(new Path(pathString), true); //delete temp directory
 
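The bug being fixed here: the old loop hard-coded the reducer output names as "/part-r-0000" + i, which requires exactly k single-centroid output files and, for i >= 10, builds part-r-000010 while Hadoop actually writes part-r-00010. Listing the output directory and skipping the _SUCCESS marker works for any number of reduce files. A hypothetical alternative, not part of the commit, that pushes the filter into listStatus via Hadoop's PathFilter; like the committed code, it reads only the first line of each file, i.e. one centroid per reducer output:

    FileStatus[] status = hdfs.listStatus(new Path(pathString),
            path -> !path.getName().equals("_SUCCESS"));
    for (FileStatus fileStatus : status) {
        BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(fileStatus.getPath())));
        String[] keyValueSplit = br.readLine().split("\t"); // "centroidId<TAB>c1,c2,..."
        points[Integer.parseInt(keyValueSplit[0])] = new Point(keyValueSplit[1].split(","));
        br.close();
    }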
16 changes: 14 additions & 2 deletions k-means/src/main/java/it/unipi/hadoop/model/Point.java
@@ -104,15 +104,27 @@ public float distance(Point p, int h){
             for (int i = 0; i < this.dim; i++) {
                 dist += Math.pow(Math.abs(this.components[i] - p.components[i]), h);
             }
-            dist = (float)Math.pow(dist, 1f/h);
+            dist = (float)Math.round(Math.pow(dist, 1f/h)*100000)/100000.0f;
             return dist;
         }
     }
 
     public void average() {
         for (int i = 0; i < this.dim; i++) {
-            this.components[i] /= this.numPoints;
+            float temp = this.components[i] / this.numPoints;
+            this.components[i] = (float)Math.round(temp*100000)/100000.0f;
         }
         this.numPoints = 1;
     }
+
+    public static float frobeniusNorm(Point[] points) {
+        float norm = 0.0f;
+        for(int i = 0; i < points.length; i++) {
+            for(int j = 0; j < points[i].dim; j++) {
+                norm += Math.pow(points[i].components[j], 2);
+            }
+        }
+        norm = (float)Math.round(Math.sqrt(norm)*100000)/100000.0f;
+        return norm;
+    }
 }
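Two notes on the arithmetic introduced here. The recurring expression (float)Math.round(x*100000)/100000.0f rounds a float to five decimal places; written as a hypothetical helper (not in the commit) it would read:

    private static float round5(float x) {
        return (float) Math.round(x * 100000) / 100000.0f; // e.g. round5(3.1415926f) == 3.14159f
    }

It is applied in distance, average, and frobeniusNorm, presumably so that centroid coordinates stay stable across iterations rather than drifting in the last float digits and upsetting the threshold comparison. And frobeniusNorm treats the whole centroid array as a single matrix: it sums the squares of every component of every point and takes the square root, e.g. centroids (3, 4) and (1, 2) give sqrt(9 + 16 + 1 + 4) = sqrt(30) = 5.47723 after rounding.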
2 changes: 1 addition & 1 deletion scripts/average.py
@@ -1,6 +1,6 @@
 from statistics import mean
 
-path = "../benchmarks/100k/output_7_7.txt"
+path = "../benchmarks/1k/output_3_7.txt"
 
 with open(path, "r") as file:
     times = []
37 changes: 17 additions & 20 deletions scripts/benchmark.ipynb
@@ -2,58 +2,55 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 173,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.cluster import KMeans\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import time"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 174,
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "\n",
-    "%%capture cap\n",
-    "%%time\n",
     "points = []\n",
     "clusters= 7\n",
-    "with open(\"./100k/dataset_3_7.txt\", \"r\") as file:\n",
-    " \n",
+    " \n",
+    "start_time = time.time() * 1000.0\n",
+    "with open(\"../datasets/1k/dataset_3_7.txt\", \"r\") as file:\n",
     "    for line in file:\n",
     "        comps = line.split(\",\")\n",
     "        point = [float(comps[i]) for i in range (len(comps)) ] \n",
     "        points.append(point)\n",
     "    \n",
     "dataset = np.array(points)\n",
-    "kmeans = KMeans(n_clusters=clusters, random_state=0).fit(dataset)\n"
+    "kmeans = KMeans(n_clusters=clusters, init='random', precompute_distances=False, random_state=0).fit(dataset)\n",
+    "end_time = time.time() * 1000.0\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 175,
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open('./100k/output_3_7.txt', 'w') as f:\n",
-    "    f.write(cap.stdout + '\\n')\n",
-    "    f.write('centroids: '+ str(kmeans.cluster_centers_))\n",
-    "    f.write('\\n')\n",
-    "    f.write('n_iter: ' + str(kmeans.n_iter_))\n",
-    "    "
+    "with open('../benchmarks/1k/output_3_7.txt', 'a') as f:\n",
+    "    f.write(\"execution time: \" + str(round(end_time - start_time, 4)) + \" ms\" + '\\n')\n",
+    "    f.write('centroids: \\n' + str(kmeans.cluster_centers_) + '\\n')\n",
+    "    f.write('n_iter: ' + str(kmeans.n_iter_) + '\\n')    "
    ]
   }
  ],
 "metadata": {
  "kernelspec": {
-  "display_name": "Python 3.7.5 64-bit",
+  "display_name": "Python 3.8.2 64-bit",
   "language": "python",
-  "name": "python37564bitc1d57a1296b7464f9f79002004778ace"
+  "name": "python38264bit165d96167a304b48a1b40131d4648cb9"
  },
 "language_info": {
  "codemirror_mode": {
@@ -65,9 +62,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.7.5"
+  "version": "3.8.2-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}
11 changes: 11 additions & 0 deletions scripts/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+for i in $(seq $2)
+do
+echo "Run: "$i >> run.txt
+hadoop jar k-means-1.0-SNAPSHOT.jar it.unipi.hadoop.KMeans $1 centroids/centroids$i >> run.txt
+echo "centroids:" >> run.txt
+hadoop fs -get centroids/centroids$i/centroids.txt ./centroids$i.txt
+cat ./centroids$i.txt >> run.txt
+rm ./centroids$i.txt
+done
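Usage inferred from the script itself (the commit adds no documentation): $1 is the input path handed to it.unipi.hadoop.KMeans and $2 is the number of runs. A hypothetical invocation such as ./run.sh dataset_3_7.txt 10 would launch ten Hadoop jobs in sequence, append each run's job output and final centroids to run.txt, and leave each run's centroid directory at centroids/centroids$i on HDFS.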
