Merge pull request #28 from Yuan-Ru-Lin/add-affinity-propagation

ablaom · web-flow · commit 4e3c06fee8d6 · 2024-12-10T16:04:15.000+11:00
Initial commit for implementation of Affinity Propagation
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.10'
           - '1'
         os:
           - ubuntu-latest
diff --git a/Project.toml b/Project.toml
@@ -6,10 +6,14 @@ version = "0.1.11"
 [deps]
 Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Clustering = "0.15"
 Distances = "0.9, 0.10"
+LinearAlgebra = "1"
 MLJModelInterface = "1.4"
-julia = "1.6"
+StatsBase = "0.34"
+julia = "1.10"
diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl
@@ -13,10 +13,12 @@ import MLJModelInterface: Continuous, Count, Finite, Multiclass, Table, OrderedF
     @mlj_model, metadata_model, metadata_pkg
 
 using Distances
+using LinearAlgebra
+using StatsBase
 
 # ===================================================================
 ## EXPORTS
-export KMeans, KMedoids, DBSCAN, HierarchicalClustering
+export KMeans, KMedoids, AffinityPropagation, DBSCAN, HierarchicalClustering
 
 # ===================================================================
 ## CONSTANTS
@@ -95,7 +97,6 @@ function MMI.transform(model::KMedoids, fitresult, X)
     return MMI.table(X̃, prototype=X)
 end
 
-
 # # PREDICT FOR K_MEANS AND K_MEDOIDS
 
 function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
@@ -208,10 +209,66 @@ end
 
 MMI.reporting_operations(::Type{<:HierarchicalClustering}) = (:predict,)
 
+# # AFFINITY_PROPAGATION
+
+@mlj_model mutable struct AffinityPropagation <: MMI.Static
+    damp::Float64 = 0.5::(0.0 ≤ _ < 1.0)  # damping factor, forwarded to `Cl.affinityprop`
+    maxiter::Int = 200::(_ > 0)  # maximum number of iterations
+    tol::Float64 = 1e-6::(_ > 0)  # convergence tolerance
+    preference::Union{Nothing,Float64} = nothing  # similarity diagonal; `nothing` => median
+    metric::SemiMetric = SqEuclidean()  # distance whose negated pairwise values are similarities
+end
+
+function MMI.predict(model::AffinityPropagation, ::Nothing, X)
+    Xarray = MMI.matrix(X)'  # adjoint of the matrix form of `X`; columns are compared below
+
+    # Similarity matrix: negated pairwise distances between the columns of `Xarray`
+    S = -pairwise(model.metric, Xarray, dims=2)
+
+    diagonal_element = if !isnothing(model.preference)
+        model.preference
+    else
+        # Default preference, as suggested in the algorithm's Wikipedia article: the
+        # median of the pairwise similarities, taken over entries above the diagonal.
+        # NOTE(review): a single observation leaves no such entries — confirm handling.
+        iuppertri = triu!(trues(size(S)),1)
+        median(S[iuppertri])
+    end
+
+    fill!(view(S, diagind(S)), diagonal_element)  # overwrite the diagonal of `S` in place
+
+    result = Cl.affinityprop(
+        S,
+        maxiter=model.maxiter,
+        tol=model.tol,
+        damp=model.damp
+    )
+
+    # Exemplar indices, the number of clusters `k`, and the categorical labels `1:k`
+    exemplars = result.exemplars
+    k = length(exemplars)
+    cluster_labels = MMI.categorical(1:k)
+
+    # Exemplar points as centers, one per column (a view into `Xarray`, not a copy)
+    centers = view(Xarray, :, exemplars)
+
+    report = (
+        exemplars=exemplars,
+        centers=centers,
+        cluster_labels=cluster_labels,
+        iterations=result.iterations,
+        converged=result.converged
+    )
+
+    return MMI.categorical(result.assignments), report  # (assignments, report) pair
+end
+
+MMI.reporting_operations(::Type{<:AffinityPropagation}) = (:predict,)  # `predict` also returns a report
+
 # # METADATA
 
 metadata_pkg.(
-    (KMeans, KMedoids, DBSCAN, HierarchicalClustering),
+    (KMeans, KMedoids, DBSCAN, HierarchicalClustering, AffinityPropagation),
     name="Clustering",
     uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
     url="https://github.com/JuliaStats/Clustering.jl",
@@ -251,6 +308,13 @@ metadata_model(
     path = "$(PKG).HierarchicalClustering"
 )
 
+metadata_model(
+    AffinityPropagation,
+    human_name = "Affinity Propagation clusterer",
+    input_scitype = MMI.Table(Continuous),  # tables whose columns have scitype `Continuous`
+    path = "$(PKG).AffinityPropagation"
+)
+
 """
 $(MMI.doc_header(KMeans))
 
@@ -618,4 +682,73 @@ report(mach).cutter(h = 2.5)
 """
 HierarchicalClustering
 
+"""
+$(MMI.doc_header(AffinityPropagation))
+
+[Affinity Propagation](https://en.wikipedia.org/wiki/Affinity_propagation) is a clustering algorithm based on the concept of "message passing" between data points. More information is available at the [Clustering.jl documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to get cluster assignments. Indices of the exemplars, their values, etc, are accessed from the machine report (see below).
+
+This is a static implementation, i.e., it does not generalize to new data instances, and
+there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
+[`KMedoids`](@ref).
+
+In MLJ or MLJBase, create a machine with
+
+    mach = machine(model)
+
+# Hyper-parameters
+
+- `damp = 0.5`: damping factor
+
+- `maxiter = 200`: maximum number of iterations
+
+- `tol = 1e-6`: tolerance for convergence
+
+- `preference = nothing`: the (single float) value of the diagonal elements of the similarity matrix. If unspecified, the median of the (negative) pairwise similarities is used, as mentioned [here](https://en.wikipedia.org/wiki/Affinity_propagation#Algorithm)
+
+- `metric = Distances.SqEuclidean()`: metric (see `Distances.jl` for available metrics)
+
+# Operations
+
+- `predict(mach, X)`: return cluster label assignments, as an unordered
+  `CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose
+  columns are of scitype `Continuous`; check column scitypes with `schema(X)`.
+
+# Report
+
+After calling `predict(mach, X)`, the fields of `report(mach)` are:
+
+- exemplars: indices of the data picked as exemplars in `X`
+
+- centers: positions of the exemplars in the feature space
+
+- cluster_labels: the labels `1, 2, ..., k` of the `k` clusters found, one per exemplar
+
+- iterations: the number of iterations run by the algorithm
+
+- converged: whether or not the algorithm converged within the maximum number of iterations
+
+# Examples
+
+```
+using MLJ
+
+X, labels = make_moons(400, noise=0.9, rng=1)
+
+AffinityPropagation = @load AffinityPropagation pkg=Clustering
+model = AffinityPropagation(preference=-10.0)
+mach = machine(model)
+
+# compute and output cluster assignments for observations in `X`:
+yhat = predict(mach, X)
+
+# Get the positions of the exemplars
+report(mach).centers
+
+# Plot clustering result
+using GLMakie
+scatter(MLJ.matrix(X)', color=yhat.refs)
+```
+"""
+AffinityPropagation
+
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -150,8 +150,40 @@ end
     @test report(mach).dendrogram.heights == dendro.heights
 end
 
+# # AffinityPropagation
+
+@testset "AffinityPropagation" begin
+    X = table(stack(Iterators.partition(0.5:0.5:20, 5))')
+
+    # Test case 1: preference unspecified, so the median pairwise similarity is used
+    mach = machine(AffinityPropagation())
+
+    yhat = predict(mach, X)
+    @test yhat == [1, 1, 1, 1, 2, 2, 2, 2]
+
+    _report = report(mach)
+    @test _report.exemplars == [2, 7]
+    @test _report.centers == [3.0 15.5; 3.5 16.0; 4.0 16.5; 4.5 17.0; 5.0 17.5]
+    @test _report.cluster_labels == [1, 2]
+    @test _report.iterations == 50
+    @test _report.converged == true
+
+    # Test case 2: preference above every pairwise similarity; each point is its own exemplar
+    mach2 = machine(AffinityPropagation(preference=-20.0))
+
+    yhat = predict(mach2, X)
+    @test yhat == [1, 2, 3, 4, 5, 6, 7, 8]
+
+    _report = report(mach2)
+    @test _report.exemplars == [1, 2, 3, 4, 5, 6, 7, 8]
+    @test _report.centers == matrix(X)'
+    @test _report.cluster_labels == [1, 2, 3, 4, 5, 6, 7, 8]
+    @test _report.iterations == 32
+    @test _report.converged == true
+end
+
 @testset "MLJ interface" begin
-    models = [KMeans, KMedoids, DBSCAN, HierarchicalClustering]
+    models = [KMeans, KMedoids, DBSCAN, HierarchicalClustering, AffinityPropagation]
     failures, summary = MLJTestInterface.test(
         models,
         X;