Removed dead code and updated docs

Andrey Oskin · Andrey Oskin · commit 54c8d37a7fae · 2020-04-13T11:43:11.000+03:00
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,4 +1,4 @@
-# ParallelKMeans.jl Package
+# [ParallelKMeans.jl Package](https://github.com/PyDataBlog/ParallelKMeans.jl)
 
 ```@contents
 Depth = 4
@@ -59,7 +59,7 @@ git checkout experimental
 
 - [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
 - [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
-- [ ] Full Implementation of Triangle inequality based on [Elkan - 2003 Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
+- [X] Full Implementation of Triangle inequality based on [Elkan - 2003 "Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
 - [ ] Refactoring and finalization of API design.
@@ -177,6 +177,7 @@ ________________________________________________________________________________
 
 - 0.1.0 Initial release
 - 0.1.1 Added interface for MLJ
+- 0.1.2 Added Elkan algorithm
 
 ## Contributing
 
diff --git a/src/ParallelKMeans.jl b/src/ParallelKMeans.jl
@@ -1,10 +1,12 @@
 module ParallelKMeans
 
 using StatsBase
-using MLJModelInterface
+import MLJModelInterface
 import Base.Threads: @spawn
 import Distances
 
+const MMI = MLJModelInterface
+
 include("seeding.jl")
 include("kmeans.jl")
 include("lloyd.jl")
diff --git a/src/kmeans.jl b/src/kmeans.jl
@@ -109,18 +109,6 @@ design matrix(x), centroids (centre), and the number of desired groups (k).
 
 A Float type representing the computed metric is returned.
 """
-function sum_of_squares(x, labels, centre)
-    s = 0.0
-
-    @inbounds for j in axes(x, 2)
-        for i in axes(x, 1)
-            s += (x[i, j] - centre[i, labels[j]])^2
-        end
-    end
-
-    return s
-end
-
 function sum_of_squares(containers, x, labels, centre, r, idx)
     s = 0.0
 
@@ -171,100 +159,3 @@ function kmeans(alg, design_matrix, k;
                     k_init = k_init, max_iters = max_iters, tol = tol,
                     verbose = verbose, init = init)
 end
-
-
-"""
-    Kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=false)
-
-Mutable version of `kmeans` function. Definition of arguments and results can be
-found in `kmeans`.
-
-Argument `containers` represent algorithm specific containers, such as labels, intermidiate
-centroids and so on, which are used during calculations.
-"""
-function kmeans!(alg, containers, design_matrix, k;
-                n_threads = Threads.nthreads(),
-                k_init = "k-means++", max_iters = 300,
-                tol = 1e-6, verbose = false, init = nothing)
-    nrow, ncol = size(design_matrix)
-    centroids = init == nothing ? smart_init(design_matrix, k, n_threads, init=k_init).centroids : deepcopy(init)
-
-    converged = false
-    niters = 0
-    J_previous = 0.0
-
-    # Update centroids & labels with closest members until convergence
-
-    while niters < max_iters
-        niters += 1
-
-        update_containers!(containers, alg, centroids, n_threads)
-        J = update_centroids!(centroids, containers, alg, design_matrix, n_threads)
-
-        if verbose
-            # Show progress and terminate if J stopped decreasing.
-            println("Iteration $niters: Jclust = $J")
-        end
-
-        # Check for convergence
-        if (niters > 1) & (abs(J - J_previous) < (tol * J))
-            converged = true
-            break
-        end
-
-        J_previous = J
-
-    end
-
-    totalcost = sum_of_squares(design_matrix, containers.labels, centroids)
-
-    # Terminate algorithm with the assumption that K-means has converged
-    if verbose & converged
-        println("Successfully terminated with convergence.")
-    end
-
-    # TODO empty placeholder vectors should be calculated
-    # TODO Float64 type definitions is too restrictive, should be relaxed
-    # especially during GPU related development
-    return KmeansResult(centroids, containers.labels, Float64[], Int[], Float64[], totalcost, niters, converged)
-end
-
-"""
-    update_centroids!(centroids, containers, alg, design_matrix, n_threads)
-
-Internal function, used to update centroids by utilizing one of `alg`. It works as
-a wrapper of internal `chunk_update_centroids!` function, splitting incoming
-`design_matrix` in chunks and combining results together.
-"""
-function update_centroids!(centroids, containers, alg, design_matrix, n_threads)
-    ncol = size(design_matrix, 2)
-
-    if n_threads == 1
-        r = axes(design_matrix, 2)
-        J = chunk_update_centroids!(centroids, containers, alg, design_matrix, r, 1)
-
-        centroids .= containers.new_centroids[1] ./ containers.centroids_cnt[1]'
-    else
-        ranges = splitter(ncol, n_threads)
-
-        waiting_list = Vector{Task}(undef, n_threads - 1)
-
-        for i in 1:length(ranges) - 1
-            waiting_list[i] = @spawn chunk_update_centroids!(centroids, containers,
-                alg, design_matrix, ranges[i], i + 1)
-        end
-
-        J = chunk_update_centroids!(centroids, containers, alg, design_matrix, ranges[end], 1)
-
-        J += sum(fetch.(waiting_list))
-
-        for i in 1:length(ranges) - 1
-            containers.new_centroids[1] .+= containers.new_centroids[i + 1]
-            containers.centroids_cnt[1] .+= containers.centroids_cnt[i + 1]
-        end
-
-        centroids .= containers.new_centroids[1] ./ containers.centroids_cnt[1]'
-    end
-
-    return J
-end
diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl
@@ -11,7 +11,7 @@ const MLJDICT = Dict(:Lloyd => Lloyd(),
 #### MODEL DEFINITION
 ####
 
-mutable struct KMeans <: MLJModelInterface.Unsupervised
+mutable struct KMeans <: MMI.Unsupervised
     algo::Symbol
     k_init::String
     k::Int
@@ -29,13 +29,13 @@ function KMeans(; algo=:Hamerly, k_init="k-means++",
                 threads=Threads.nthreads(), verbosity=0, init=nothing)
 
     model   = KMeans(algo, k_init, k, tol, max_iters, copy, threads, verbosity, init)
-    message = MLJModelInterface.clean!(model)
+    message = MMI.clean!(model)
     isempty(message) || @warn message
     return model
 end
 
 
-function MLJModelInterface.clean!(m::KMeans)
+function MMI.clean!(m::KMeans)
     warning = ""
 
     if !(m.algo ∈ keys(MLJDICT))
@@ -78,14 +78,14 @@ end
 
     See also the [package documentation](https://pydatablog.github.io/ParallelKMeans.jl/stable).
 """
-function MLJModelInterface.fit(m::KMeans, X)
+function MMI.fit(m::KMeans, X)
     # convert tabular input data into the matrix model expects. Column assumed as features so input data is permuted
     if !m.copy
         # permutes dimensions of input table without copying and pass to model
-        DMatrix = convert(Array{Float64, 2}, MLJModelInterface.matrix(X)')
+        DMatrix = convert(Array{Float64, 2}, MMI.matrix(X)')
     else
         # permutes dimensions of input table as a column major matrix from a copy of the data
-        DMatrix = convert(Array{Float64, 2}, MLJModelInterface.matrix(X, transpose=true))
+        DMatrix = convert(Array{Float64, 2}, MMI.matrix(X, transpose=true))
     end
 
     # lookup available algorithms
@@ -106,7 +106,7 @@ function MLJModelInterface.fit(m::KMeans, X)
 end
 
 
-function MLJModelInterface.fitted_params(model::KMeans, fitresult)
+function MMI.fitted_params(model::KMeans, fitresult)
     # extract what's relevant from `fitresult`
     results, _, _ = fitresult  # unpack fitresult
     centers = results.centers
@@ -124,15 +124,15 @@ end
 #### PREDICT FUNCTION
 ####
 
-function MLJModelInterface.transform(m::KMeans, fitresult, Xnew)
+function MMI.transform(m::KMeans, fitresult, Xnew)
     # make predictions/assignments using the learned centroids
 
     if !m.copy
         # permutes dimensions of input table without copying and pass to model
-        DMatrix = convert(Array{Float64, 2}, MLJModelInterface.matrix(Xnew)')
+        DMatrix = convert(Array{Float64, 2}, MMI.matrix(Xnew)')
     else
         # permutes dimensions of input table as a column major matrix from a copy of the data
-        DMatrix = convert(Array{Float64, 2}, MLJModelInterface.matrix(Xnew, transpose=true))
+        DMatrix = convert(Array{Float64, 2}, MMI.matrix(Xnew, transpose=true))
     end
 
     # TODO: Warn users if fitresult is from a `non-converged` fit?
@@ -147,7 +147,7 @@ function MLJModelInterface.transform(m::KMeans, fitresult, Xnew)
     centroids = results.centers
     distances = Distances.pairwise(Distances.SqEuclidean(), DMatrix, centroids; dims=2)
     preds = argmin.(eachrow(distances))
-    return MLJModelInterface.table(reshape(preds, :, 1), prototype=Xnew)
+    return MMI.table(reshape(preds, :, 1), prototype=Xnew)
 end
 
 
@@ -156,7 +156,7 @@ end
 ####
 
 # TODO 4: metadata for the package and for each of the model interfaces
-metadata_pkg.(KMeans,
+MMI.metadata_pkg.(KMeans,
     name = "ParallelKMeans",
     uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af",
     url  = "https://github.com/PyDataBlog/ParallelKMeans.jl",
@@ -166,9 +166,9 @@ metadata_pkg.(KMeans,
 
 
 # Metadata for ParaKMeans model interface
-metadata_model(KMeans,
-    input   = MLJModelInterface.Table(MLJModelInterface.Continuous),
-    output  = MLJModelInterface.Table(MLJModelInterface.Count),
+MMI.metadata_model(KMeans,
+    input   = MMI.Table(MMI.Continuous),
+    output  = MMI.Table(MMI.Count),
     weights = false,
     descr   = ParallelKMeans_Desc,
 	path	= "ParallelKMeans.KMeans")
diff --git a/test/test02_lloyd.jl b/test/test02_lloyd.jl
@@ -1,4 +1,4 @@
-module TestKMeans
+module TestLloyd
 
 using ParallelKMeans
 using Test

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-module TestKMeans`
	`1`	`+module TestLloyd`
`2`	`2`
`3`	`3`	`using ParallelKMeans`
`4`	`4`	`using Test`