This repository was archived by the owner on Jun 8, 2021. It is now read-only.

bug fix
Gianluca Serao authored and committed Jun 30, 2020
1 parent 708689b commit 174a1e9
Showing 5 changed files with 62 additions and 38 deletions.
34 changes: 19 additions & 15 deletions k-means/src/main/java/it/unipi/hadoop/KMeans.java
@@ -13,6 +13,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
@@ -100,11 +101,13 @@ public void reduce(IntWritable centroid, Iterable<Point> partialSums, Context co
 
     private static boolean stoppingCriterion(Point[] oldCentroids, Point[] newCentroids, int distance, float threshold) {
         boolean check = true;
-        for(int i = 0; i < oldCentroids.length; i++) {
-            check = oldCentroids[i].distance(newCentroids[i], distance) <= threshold;
-            if (!check) {
-                return false;
-            }
+        float oldNorm = 0.0f;
+        float newNorm = 0.0f;
+        oldNorm = Point.frobeniusNorm(oldCentroids);
+        newNorm = Point.frobeniusNorm(newCentroids);
+        check = Math.abs(newNorm - oldNorm) <= threshold;
+        if(!check) {
+            return false;
         }
         return true;
     }
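The criterion changes from per-centroid distances to one global test: the run now stops when the Frobenius norms of the old and new centroid sets differ by at most the threshold. A worked sketch with hypothetical numbers (not taken from the commit), assuming it runs inside KMeans where stoppingCriterion and the Point(String[]) constructor used by readCentroids are visible:

    // Two 2-d centroid sets; the values are illustrative only.
    Point[] oldCentroids = { new Point(new String[]{"1.0", "2.0"}), new Point(new String[]{"3.0", "4.0"}) };
    Point[] newCentroids = { new Point(new String[]{"1.1", "2.0"}), new Point(new String[]{"3.0", "4.1"}) };
    // ||old||_F = sqrt(1 + 4 + 9 + 16)       = sqrt(30)    = 5.47723 (rounded to 5 decimals by frobeniusNorm)
    // ||new||_F = sqrt(1.21 + 4 + 9 + 16.81) = sqrt(31.02) = 5.56956
    // |5.56956 - 5.47723| = 0.09233 <= 0.1, so the iteration stops.
    boolean stop = stoppingCriterion(oldCentroids, newCentroids, 2, 0.1f); // true; note the distance argument is no longer used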
@@ -152,17 +155,18 @@ private static Point[] centroidsInit(Configuration conf, String pathString, int
 
     private static Point[] readCentroids(Configuration conf, int k, String pathString) throws IOException {
         Point[] points = new Point[k];
-        FileSystem hdfs = FileSystem.get(conf);
+        FileSystem hdfs = FileSystem.get(conf);
+        FileStatus[] status = hdfs.listStatus(new Path(pathString));
 
-        for (int i = 0; i < k; i++) {
-            Path path = new Path(pathString + "/part-r-0000" + i);
-            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(path)));
-
-            String[] keyValueSplit = br.readLine().split("\t"); //Split line in K,V
-            int centroidId = Integer.parseInt(keyValueSplit[0]);
-            String[] point = keyValueSplit[1].split(",");
-            points[centroidId] = new Point(point);
-            br.close();
+        for (int i = 0; i < status.length; i++) {
+            if(!status[i].getPath().toString().endsWith("_SUCCESS")) {
+                BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(status[i].getPath())));
+                String[] keyValueSplit = br.readLine().split("\t"); //Split line in K,V
+                int centroidId = Integer.parseInt(keyValueSplit[0]);
+                String[] point = keyValueSplit[1].split(",");
+                points[centroidId] = new Point(point);
+                br.close();
+            }
         }
         hdfs.delete(new Path(pathString), true); //delete temp directory
 
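The bug being fixed here: the old loop hard-coded the reducer output names as "/part-r-0000" + i, which requires exactly k single-centroid output files and, for i >= 10, builds part-r-000010 while Hadoop actually writes part-r-00010. Listing the output directory and skipping the _SUCCESS marker works for any number of reduce files. A hypothetical alternative, not part of the commit, that pushes the filter into listStatus via Hadoop's PathFilter; like the committed code, it reads only the first line of each file, i.e. one centroid per reducer output:

    FileStatus[] status = hdfs.listStatus(new Path(pathString),
            path -> !path.getName().equals("_SUCCESS"));
    for (FileStatus fileStatus : status) {
        BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(fileStatus.getPath())));
        String[] keyValueSplit = br.readLine().split("\t"); // "centroidId<TAB>c1,c2,..."
        points[Integer.parseInt(keyValueSplit[0])] = new Point(keyValueSplit[1].split(","));
        br.close();
    }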
16 changes: 14 additions & 2 deletions k-means/src/main/java/it/unipi/hadoop/model/Point.java
@@ -104,15 +104,27 @@ public float distance(Point p, int h){
             for (int i = 0; i < this.dim; i++) {
                 dist += Math.pow(Math.abs(this.components[i] - p.components[i]), h);
             }
-            dist = (float)Math.pow(dist, 1f/h);
+            dist = (float)Math.round(Math.pow(dist, 1f/h)*100000)/100000.0f;
             return dist;
         }
     }
 
     public void average() {
         for (int i = 0; i < this.dim; i++) {
-            this.components[i] /= this.numPoints;
+            float temp = this.components[i] / this.numPoints;
+            this.components[i] = (float)Math.round(temp*100000)/100000.0f;
         }
         this.numPoints = 1;
     }
+
+    public static float frobeniusNorm(Point[] points) {
+        float norm = 0.0f;
+        for(int i = 0; i < points.length; i++) {
+            for(int j = 0; j < points[i].dim; j++) {
+                norm += Math.pow(points[i].components[j], 2);
+            }
+        }
+        norm = (float)Math.round(Math.sqrt(norm)*100000)/100000.0f;
+        return norm;
+    }
 }
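Two notes on the arithmetic introduced here. The recurring expression (float)Math.round(x*100000)/100000.0f rounds a float to five decimal places; written as a hypothetical helper (not in the commit) it would read:

    private static float round5(float x) {
        return (float) Math.round(x * 100000) / 100000.0f; // e.g. round5(3.1415926f) == 3.14159f
    }

It is applied in distance, average, and frobeniusNorm, presumably so that centroid coordinates stay stable across iterations rather than drifting in the last float digits and upsetting the threshold comparison. And frobeniusNorm treats the whole centroid array as a single matrix: it sums the squares of every component of every point and takes the square root, e.g. centroids (3, 4) and (1, 2) give sqrt(9 + 16 + 1 + 4) = sqrt(30) = 5.47723 after rounding.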
2 changes: 1 addition & 1 deletion scripts/average.py
@@ -1,6 +1,6 @@
 from statistics import mean
 
-path = "../benchmarks/100k/output_7_7.txt"
+path = "../benchmarks/1k/output_3_7.txt"
 
 with open(path, "r") as file:
     times = []
37 changes: 17 additions & 20 deletions scripts/benchmark.ipynb
@@ -2,58 +2,55 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 173,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.cluster import KMeans\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import time"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 174,
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "\n",
-    "%%capture cap\n",
-    "%%time\n",
     "points = []\n",
     "clusters= 7\n",
-    "with open(\"./100k/dataset_3_7.txt\", \"r\") as file:\n",
-    " \n",
+    " \n",
+    "start_time = time.time() * 1000.0\n",
+    "with open(\"../datasets/1k/dataset_3_7.txt\", \"r\") as file:\n",
     "    for line in file:\n",
     "        comps = line.split(\",\")\n",
     "        point = [float(comps[i]) for i in range (len(comps)) ] \n",
     "        points.append(point)\n",
     "    \n",
     "dataset = np.array(points)\n",
-    "kmeans = KMeans(n_clusters=clusters, random_state=0).fit(dataset)\n"
+    "kmeans = KMeans(n_clusters=clusters, init='random', precompute_distances=False, random_state=0).fit(dataset)\n",
+    "end_time = time.time() * 1000.0\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 175,
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open('./100k/output_3_7.txt', 'w') as f:\n",
-    "    f.write(cap.stdout + '\\n')\n",
-    "    f.write('centroids: '+ str(kmeans.cluster_centers_))\n",
-    "    f.write('\\n')\n",
-    "    f.write('n_iter: ' + str(kmeans.n_iter_))\n",
-    "    "
+    "with open('../benchmarks/1k/output_3_7.txt', 'a') as f:\n",
+    "    f.write(\"execution time: \" + str(round(end_time - start_time, 4)) + \" ms\" + '\\n')\n",
+    "    f.write('centroids: \\n' + str(kmeans.cluster_centers_) + '\\n')\n",
+    "    f.write('n_iter: ' + str(kmeans.n_iter_) + '\\n')    "
    ]
   }
  ],
 "metadata": {
  "kernelspec": {
-  "display_name": "Python 3.7.5 64-bit",
+  "display_name": "Python 3.8.2 64-bit",
   "language": "python",
-  "name": "python37564bitc1d57a1296b7464f9f79002004778ace"
+  "name": "python38264bit165d96167a304b48a1b40131d4648cb9"
  },
 "language_info": {
  "codemirror_mode": {
@@ -65,9 +62,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.7.5"
+  "version": "3.8.2-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}
11 changes: 11 additions & 0 deletions scripts/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+for i in $(seq $2)
+do
+echo "Run: "$i >> run.txt
+hadoop jar k-means-1.0-SNAPSHOT.jar it.unipi.hadoop.KMeans $1 centroids/centroids$i >> run.txt
+echo "centroids:" >> run.txt
+hadoop fs -get centroids/centroids$i/centroids.txt ./centroids$i.txt
+cat ./centroids$i.txt >> run.txt
+rm ./centroids$i.txt
+done
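Usage inferred from the script itself (the commit adds no documentation): $1 is the input path handed to it.unipi.hadoop.KMeans and $2 is the number of runs. A hypothetical invocation such as ./run.sh dataset_3_7.txt 10 would launch ten Hadoop jobs in sequence, append each run's job output and final centroids to run.txt, and leave each run's centroid directory at centroids/centroids$i on HDFS.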
