Finalized coresets and tests

Andrey Oskin · Andrey Oskin · commit 9e6a174b13c1 · 2020-04-27T10:35:46.000+03:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
 authors = ["Bernard Brenyah", "Andrey Oskin"]
-version = "0.1.5"
+version = "0.1.6"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -72,15 +72,16 @@ git checkout experimental
 - [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
 - [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
 - [X] Full Implementation of Triangle inequality based on [Elkan - 2003 Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
-- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
+- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf).
+- [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
+- [X] Support for weighted K-means.
 - [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
 - [ ] Support of MLJ Random generation hyperparameter.
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
 - [ ] Refactoring and finalizaiton of API desgin.
 - [ ] GPU support.
 - [ ] Distributed calculations support.
-- [ ] Implementation of other K-Means algorithm variants based on recent literature.
 - [ ] Optimization of code base.
 - [ ] Improved Documentation
 - [ ] More benchmark tests.
@@ -123,6 +124,7 @@ r.converged             # whether the procedure converged
 - [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster) - Hamerly is good for moderate number of clusters (< 50?) and moderate dimensions (<100?).
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
+- [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
 - [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
 
@@ -204,6 +206,7 @@ ________________________________________________________________________________
 - 0.1.3 Faster & optimized execution.
 - 0.1.4 Bug fixes.
 - 0.1.5 Added `Yinyang` algorithm.
+- 0.1.6 Added support for weighted k-means; Added `Coreset` algorithm; improved support for different types of the design matrix.
 
 ## Contributing
 
diff --git a/src/ParallelKMeans.jl b/src/ParallelKMeans.jl
@@ -17,6 +17,6 @@ include("mlj_interface.jl")
 include("coreset.jl")
 
 export kmeans
-export Lloyd, Hamerly, Elkan, Yinyang, Coreset
+export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset
 
 end # module
diff --git a/src/coreset.jl b/src/coreset.jl
@@ -3,22 +3,37 @@
 
 Coreset algorithm implementation, based on "Lucic, Mario & Bachem,
 Olivier & Krause, Andreas. (2015). Strong Coresets for Hard and Soft Bregman
-Clustering with Applications to Exponential Family Mixtures. "
+Clustering with Applications to Exponential Family Mixtures."
+
+`Coreset` supports the following arguments:
+- `m`: default 100, subsample size
+- `alg`: default `Lloyd()`, algorithm used to cluster the subsample
 
 It can be used directly in `kmeans` function
 
 ```julia
 X = rand(30, 100_000)   # 100_000 random points in 30 dimensions
 
-kmeans(Coreset(), X, 3) # 3 clusters, Coreset algorithm
+# 3 clusters, Coreset algorithm with default Lloyd algorithm and 100 subsamples
+kmeans(Coreset(), X, 3)
+
+# 3 clusters, Coreset algorithm with Hamerly algorithm and 500 subsamples
+kmeans(Coreset(m = 500, alg = Hamerly()), X, 3)
+kmeans(Coreset(500, Hamerly()), X, 3)
+
+# alternatively, a short form can be used to set only the subsample size or only the algorithm
+kmeans(Coreset(500), X, 3) # sample of the size 500, Lloyd clustering algorithm
+kmeans(Coreset(Hamerly()), X, 3) # sample of the size 100, Hamerly clustering algorithm
 ```
 """
 struct Coreset{T <: AbstractKMeansAlg} <: AbstractKMeansAlg
     m::Int
     alg::T
 end
 
-Coreset() = Coreset(100, Lloyd())
+Coreset(; m = 100, alg = Lloyd()) = Coreset(m, alg)
+Coreset(m::Int) = Coreset(m, Lloyd())
+Coreset(alg::AbstractKMeansAlg) = Coreset(100, alg)
 
 function kmeans!(alg::Coreset, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
diff --git a/src/elkan.jl b/src/elkan.jl
@@ -23,7 +23,7 @@ function kmeans!(alg::Elkan, containers, X, k, weights;
                 k_init = "k-means++", max_iters = 300,
                 tol = eltype(X)(1e-6), verbose = false, init = nothing)
     nrow, ncol = size(X)
-    centroids = init == nothing ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
+    centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
 
     update_containers(alg, containers, centroids, n_threads)
     @parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X, weights)
diff --git a/src/hamerly.jl b/src/hamerly.jl
@@ -23,7 +23,7 @@ function kmeans!(alg::Hamerly, containers, X, k, weights;
                 k_init = "k-means++", max_iters = 300,
                 tol = eltype(X)(1e-6), verbose = false, init = nothing)
     nrow, ncol = size(X)
-    centroids = init == nothing ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
+    centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
 
     @parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X, weights)
 
diff --git a/src/lloyd.jl b/src/lloyd.jl
@@ -19,7 +19,7 @@ function kmeans!(alg::Lloyd, containers, X, k, weights;
                 k_init = "k-means++", max_iters = 300,
                 tol = eltype(design_matrix)(1e-6), verbose = false, init = nothing)
     nrow, ncol = size(X)
-    centroids = isnothing(init) ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
+    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
 
     T = eltype(X)
     converged = false
diff --git a/src/seeding.jl b/src/seeding.jl
@@ -10,16 +10,18 @@ end
 
 
 """
-    chunk_colwise!(target, x, y, r)
+    chunk_colwise(target, x, y, i, weights, r, idx)
 
-Utility function for calculation of the `colwise!(target, x, y, n_threads)` function.
+Utility function for the calculation of the weighted distance between points `x` and
+centroid vector `y[:, i]`.
 UnitRange argument `r` select subarray of original design matrix `x` that is going
 to be processed.
 """
-function chunk_colwise(target, x, y, i, r, idx)
+function chunk_colwise(target, x, y, i, weights, r, idx)
     T = eltype(x)
     @inbounds for j in r
         dist = distance(x, y, j, i)
+        dist = isnothing(weights) ? dist : weights[j] * dist
         target[j] = dist < target[j] ? dist : target[j]
     end
 end
@@ -35,7 +37,7 @@ of centroids from X used if any other string is attempted.
 
 A named tuple representing centroids and indices respecitively is returned.
 """
-function smart_init(X, k, n_threads = Threads.nthreads();
+function smart_init(X, k, n_threads = Threads.nthreads(), weights = nothing;
         init = "k-means++")
 
     nrow, ncol = size(X)
@@ -50,7 +52,7 @@ function smart_init(X, k, n_threads = Threads.nthreads();
         # TODO relax constraints on distances, may be should
         # define `X` as X::AbstractArray{T} where {T <: Number}
         # and use this T for all calculations.
-        rand_idx = rand(1:ncol)
+        rand_idx = isnothing(weights) ? rand(1:ncol) : wsample(1:ncol, weights)
         rand_indices[1] = rand_idx
         @inbounds for j in axes(X, 1)
             centroids[j, 1] = X[j, rand_idx]
@@ -61,7 +63,7 @@ function smart_init(X, k, n_threads = Threads.nthreads();
         distances = fill(T(Inf), ncol)
 
         # compute distances from the first centroid chosen to all the other data points
-        @parallelize n_threads ncol chunk_colwise(distances, X, centroids, 1)
+        @parallelize n_threads ncol chunk_colwise(distances, X, centroids, 1, weights)
         distances[rand_idx] = zero(T)
 
         for i = 2:k
@@ -77,7 +79,7 @@ function smart_init(X, k, n_threads = Threads.nthreads();
             i == k && break
 
             # compute distances from the centroids to all data points
-            @parallelize n_threads ncol chunk_colwise(distances, X, centroids, i)
+            @parallelize n_threads ncol chunk_colwise(distances, X, centroids, i, weights)
 
             distances[r_idx] = zero(T)
         end
diff --git a/src/yinyang.jl b/src/yinyang.jl
@@ -8,34 +8,51 @@ Conference on Machine Learning, ICML 2015, Lille, France, 6-11 July 2015"
 Generally it outperform `Hamerly` algorithm and has roughly the same time as `Elkan`
 algorithm with much lower memory consumption.
 
+
+`Yinyang` supports the following arguments:
+- `auto`: `Bool`, indicates whether to perform automated or manual grouping
+- `group_size`: `Int`, estimation of average number of clusters per group. Lower numbers
+  correspond to higher calculation speed and higher memory consumption and vice versa.
+
 It can be used directly in `kmeans` function
 
 ```julia
 X = rand(30, 100_000)   # 100_000 random points in 30 dimensions
 
-kmeans(Yinyang(), X, 3) # 3 clusters, Yinyang algorithm
-```
+# 3 clusters, Yinyang algorithm, with the default group_size of 7
+kmeans(Yinyang(), X, 3)
 
-`Yinyang` supports following arguments:
-`auto`: `Bool`, indicates whether to perform automated or manual grouping
-`group_size`: `Int`, estimation of average number of clusters per group. Lower numbers
-corresponds to higher calculation speed and higher memory consumption and vice versa.
+# Following are equivalent
+# 3 clusters, Yinyang algorithm with 10 group_size
+kmeans(Yinyang(group_size = 10), X, 3)
+kmeans(Yinyang(10), X, 3)
+
+# One group with the size of the number of points
+kmeans(Yinyang(auto = false), X, 3)
+kmeans(Yinyang(false), X, 3)
+
+# Chinese writing can be used
+kmeans(阴阳(), X, 3)
+```
 """
 struct Yinyang <: AbstractKMeansAlg
     auto::Bool
     group_size::Int
 end
 
-Yinyang() = Yinyang(true, 7)
 Yinyang(auto::Bool) = Yinyang(auto, 7)
 Yinyang(group_size::Int) = Yinyang(true, group_size)
+Yinyang(; group_size = 7, auto = true) = Yinyang(auto, group_size)
+阴阳(auto::Bool) = Yinyang(auto, 7)
+阴阳(group_size::Int) = Yinyang(true, group_size)
+阴阳(; group_size = 7, auto = true) = Yinyang(auto, group_size)
 
 function kmeans!(alg::Yinyang, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
                 tol = 1e-6, verbose = false, init = nothing)
     nrow, ncol = size(X)
-    centroids = init == nothing ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
+    centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
 
     # create initial groups of centers, step 1 in original paper
     initialize(alg, containers, centroids, n_threads)
diff --git a/test/test01_distance.jl b/test/test01_distance.jl
@@ -1,22 +1,27 @@
 module TestDistance
-using ParallelKMeans: colwise!
+using ParallelKMeans: chunk_colwise, @parallelize
 using Test
 
 @testset "naive singlethread colwise" begin
     X = [1.0 3.0 4.0; 2.0 5.0 6.0]
-    y = [1.0, 2.0]
-    r = Vector{Float64}(undef, 3)
+    y = permutedims([1.0, 2.0]')
+    ncol = size(X, 2)
+    r = fill(Inf, ncol)
+    n_threads = 1
 
-    colwise!(r, X, y, 1)
+    @parallelize n_threads ncol chunk_colwise(r, X, y, 1, nothing)
     @test all(r .≈ [0.0, 13.0, 25.0])
 end
 
 @testset "multithread colwise" begin
     X = [1.0 3.0 4.0; 2.0 5.0 6.0]
-    y = [1.0, 2.0]
-    r = Vector{Float64}(undef, 3)
+    y = permutedims([1.0, 2.0]')
+    ncol = size(X, 2)
+    r = fill(Inf, ncol)
+    n_threads = 2
+
+    @parallelize n_threads ncol chunk_colwise(r, X, y, 1, nothing)
 
-    colwise!(r, X, y, 2)
     @test all(r .≈ [0.0, 13.0, 25.0])
 end
 
diff --git a/test/test02_kmpp.jl b/test/test02_kmpp.jl
diff --git a/test/test03_lloyd.jl b/test/test03_lloyd.jl
@@ -3,6 +3,7 @@ module TestLloyd
 using ParallelKMeans
 using Test
 using Random
+using StatsBase
 
 @testset "basic kmeans" begin
     X = [1. 2. 4.;]
@@ -69,4 +70,36 @@ end
     @test res.iterations == 11
 end
 
+@testset "Lloyd test weighted X" begin
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    init = sample(1:100, 10, replace = false)
+    init = X[:, init]
+
+    res = kmeans(Lloyd(), X, 10, weights; init = init, n_threads = 1, tol = 1e-10, max_iters = 100, verbose = false)
+    @test res.totalcost ≈ 2.726538026486045
+    @test res.converged
+    @test res.iterations == 9
+
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    res = kmeans(Lloyd(), X, 10, weights; n_threads = 1, tol = 1e-10, max_iters = 100, verbose = false)
+    @test res.totalcost ≈ 2.75774704578635
+    @test res.converged
+    @test res.iterations == 9
+
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    res = kmeans(Lloyd(), X, 10, weights; n_threads = 1, tol = 1e-10, max_iters = 100, verbose = false)
+    @test res.totalcost ≈ 2.75774704578635
+    @test res.converged
+    @test res.iterations == 9
+end
+
 end # module
diff --git a/test/test04_elkan.jl b/test/test04_elkan.jl
@@ -69,4 +69,30 @@ end
     @test res.iterations == 11
 end
 
+@testset "Elkan weights support" begin
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    baseline = kmeans(Lloyd(), X, 10, weights; tol = 1e-10, verbose = false)
+
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    res = kmeans(Elkan(), X, 10, weights; tol = 1e-10, verbose = false)
+    @test res.totalcost ≈ baseline.totalcost
+    @test res.converged
+    @test res.iterations == baseline.iterations
+
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    res = kmeans(Elkan(), X, 10, weights; n_threads = 2, tol = 1e-10, verbose = false)
+    @test res.totalcost ≈ baseline.totalcost
+    @test res.converged
+    @test res.iterations == baseline.iterations
+end
+
 end # module
diff --git a/test/test05_hamerly.jl b/test/test05_hamerly.jl
@@ -11,7 +11,7 @@ using Random
     nrow, ncol = size(X)
     containers = ParallelKMeans.create_containers(Hamerly(), X, 3, nrow, ncol, 1)
 
-    ParallelKMeans.chunk_initialize(Hamerly(), containers, centroids, X, 1:ncol, 1)
+    ParallelKMeans.chunk_initialize(Hamerly(), containers, centroids, X, nothing, 1:ncol, 1)
     @test containers.lb == [18.0, 20.0, 5.0, 5.0]
     @test containers.ub == [0.0, 2.0, 0.0, 0.0]
 end
@@ -81,4 +81,30 @@ end
     @test res.iterations == 11
 end
 
+@testset "Hamerly weights support" begin
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    baseline = kmeans(Lloyd(), X, 10, weights; tol = 1e-10, verbose = false)
+
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    res = kmeans(Hamerly(), X, 10, weights; tol = 1e-10, verbose = false)
+    @test res.totalcost ≈ baseline.totalcost
+    @test res.converged
+    @test res.iterations == baseline.iterations
+
+    Random.seed!(2020)
+    X = rand(3, 100)
+    weights = rand(100)
+
+    res = kmeans(Hamerly(), X, 10, weights; n_threads = 2, tol = 1e-10, verbose = false)
+    @test res.totalcost ≈ baseline.totalcost
+    @test res.converged
+    @test res.iterations == baseline.iterations
+end
+
 end # module
diff --git a/test/test06_yinyang.jl b/test/test06_yinyang.jl
diff --git a/test/test07_coreset.jl b/test/test07_coreset.jl
diff --git a/test/test70_verbose.jl b/test/test70_verbose.jl
diff --git a/test/test80_mlj_interface.jl b/test/test80_mlj_interface.jl