JuliaDynamics · Datseris · Oct 5, 2023 · Sep 30, 2023 · Sep 30, 2023 · Sep 30, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,6 @@
+# v1.12
+- New algorithm `GroupViaPairwiseComparison` to group features in `AttractorsViaFeaturizing`. Simpler, typically faster and using less memory than DBSCAN, it can be useful in well-behaved systems.
+
 # v1.11
 - New algorithm `subdivision_based_grid`. Allows user to automatically constuct the grid which simulates subdivision into regions with different discretization levels in accordance with state space flow speed.
 

diff --git a/Project.toml b/Project.toml
@@ -2,7 +2,7 @@ name = "Attractors"
 uuid = "f3fd9213-ca85-4dba-9dfd-7fc91308fec7"
 authors = ["George Datseris <[email protected]>", "Kalel Rossi", "Alexandre Wagemakers"]
 repo = "https://github.com/JuliaDynamics/Attractors.jl.git"
-version = "1.11.0"
+version = "1.12.0"
 
 [deps]
 BlackBoxOptim = "a134a8b2-14d6-55f6-9291-3336d3ab0209"

diff --git a/src/mapping/grouping/all_grouping_configs.jl b/src/mapping/grouping/all_grouping_configs.jl
@@ -12,6 +12,7 @@ Currently available grouping configurations are:
 - [`GroupViaClustering`](@ref)
 - [`GroupViaNearestFeature`](@ref)
 - [`GroupViaHistogram`](@ref)
+- [`GroupViaPairwiseComparison`](@ref)
 
 `GroupingConfig` defines an extendable interface.
 The only thing necessary for a new grouping configuration is to:
@@ -54,3 +55,4 @@ end
 include("cluster_config.jl")
 include("histogram_config.jl")
 include("nearest_feature_config.jl")
+include("pairwise_comparison.jl")
diff --git a/src/mapping/grouping/pairwise_comparison.jl b/src/mapping/grouping/pairwise_comparison.jl
@@ -0,0 +1,116 @@
+export GroupViaPairwiseComparison
+
+"""
+    GroupViaPairwiseComparison(; distance_threshold::Real, kwargs...)
+
+Initialize a struct that contains instructions on how to group features in
+[`AttractorsViaFeaturizing`](@ref). `GroupViaPairwiseComparison` groups features and
+identifies clusters by considering the pairwise distance between features. It can be used
+as an alternative to the clustering method in `GroupViaClustering`, having the
+advantage that it is simpler, typically faster and uses less memory. 
+
+## Keyword arguments
+
+* `distance_threshold`: a real number defining the maximum distance two features can be to
+  be considered in the same cluster - above the threshold, features are different. This
+  value simply needs to be large enough to differentiate clusters.
+* `distance_metric = Euclidean()`: A metric to be used in the clustering. It can be any
+  function `f(a, b)` that returns the distance between any type of data structure (usually
+  vectors or matrices of reals). Needs to be consistent with the `featurizer` function. All
+  metrics from Distances.jl can be used here.
+* `rescale_features = true`: if true, rescale each dimension of the extracted features
+  separately into the range `[0,1]`. This typically leads to more accurate clustering.
+
+## Description
+This algorithm assumes that the features are well-separated into distinct clouds, with the
+maximum radius of the cloud controlled by `distance_threshold`. Since the systems are
+deterministic, this is achievable with a good-enough `featurizer` function, by removing
+transients, and running the trajectories for sufficiently long. It then considers that
+features belong to the same attractor when their pairwise distance, computed using
+`distance_metric`, is smaller than or equal to `distance_threshold`, and that they belong
+to different attractors when the distance is bigger. Attractors correspond to each
+grouping of similar features. In this way, the key parameter `distance_threshold` is
+basically the amount of variation permissible in the features belonging to the same
+attractor. If they are well-chosen, the value can be relatively small and does not need to
+be fine tuned. 
+
+The `distance_threshold` should achieve a balance: one one hand, it should be large enough
+to account for variations in the features from the same attractor - if it's not large
+enough, the algorithm will find duplicate attractors. On the other hand, it should be
+small enough to not group together features from distinct attractors. This requires some
+knowledge of how spread the features are. If it's too big, the algorithm will miss some
+attractors, as it groups 2+ distinct attractors together. Therefore, as a rule of thumb,
+one can repeat the procedure a few times, starting with a relatively large value and
+reducing it until no more attractors are found and no duplicates appear.
+
+The method uses relatively little memory, as it only stores vectors whose size is on order
+of the number of attractors of the system. 
+"""
+struct GroupViaPairwiseComparison{R<:Real, M} <: GroupingConfig
+    distance_threshold::R
+    distance_metric::M
+    rescale_features::Bool 
+end
+
+function GroupViaPairwiseComparison(;
+        distance_threshold, #impossible to set a good default value, depends on the features
+        distance_metric=Euclidean(), rescale_features=false, 
+    )
+    return GroupViaPairwiseComparison(
+        distance_threshold,
+        distance_metric, rescale_features,
+    )
+end
+
+function group_features(
+    features, config::GroupViaPairwiseComparison;
+    kwargs...
+)
+    if config.rescale_features
+        features = _rescale_to_01(features)
+    end
+
+    labels = _cluster_features_into_labels(features, config, config.distance_threshold; kwargs...)
+    return labels
+end
+
+# TODO: add support for par_weight,plength and spp in the computation of the distance metric?
+function _cluster_features_into_labels(features, config::GroupViaPairwiseComparison, distance_threshold::Real; kwargs...)
+    labels_features = Vector{Int64}(undef, length(features)) #labels of all features
+    metric = config.distance_metric
+
+    # Assign feature 1 as a new attractor
+    labels_features[1] = 1
+    cluster_idxs = [1] # idxs of the features that define each cluster
+    cluster_labels = [1] # labels for the clusters, going from 1 : num_clusters
+    next_cluster_label = 2
+
+    for idx_feature = 2:length(features)
+        feature = features[idx_feature]
+        dist_to_clusters = _distance_dict(feature, features, cluster_idxs, cluster_labels, metric; kwargs...)
+        min_dist, closest_cluster_label = findmin(dist_to_clusters)
+
+        if min_dist > distance_threshold #bigger than threshold => new attractor
+            feature_label = next_cluster_label
+            push!(cluster_idxs, idx_feature)
+            push!(cluster_labels, next_cluster_label)
+            # @info "New attractor $next_cluster_label, min dist was $min_dist > $distance_threshold" #TODO: allow this when debugging verbose mode on!
+            next_cluster_label += 1
+        else #smaller than threshold => assign to closest cluster 
+            feature_label = closest_cluster_label
+        end
+
+        labels_features[idx_feature] = feature_label 
+    end 
+    return labels_features
+end
+
+function _distance_dict(feature, features, cluster_idxs, cluster_labels, metric; kwargs...)
+    if metric isa Metric
+        dist_to_clusters = Dict(cluster_label => evaluate(metric, feature, features[cluster_idxs[idx_cluster]]) for (idx_cluster, cluster_label) in enumerate(cluster_labels))
+    else
+        dist_to_clusters = Dict(cluster_label => metric(feature, features[cluster_idxs[idx_cluster]]) for (idx_cluster, cluster_label) in enumerate(cluster_labels))
+    end
+
+    return dist_to_clusters 
+end
diff --git a/test/mapping/attractor_mapping.jl b/test/mapping/attractor_mapping.jl
@@ -5,7 +5,6 @@
 # If not, a small, but representative subset of mappers and dynamical systems is used.
 
 DO_EXTENSIVE_TESTS = get(ENV, "ATTRACTORS_EXTENSIVE_TESTS", "false") == "true"
-
 using Test
 using Attractors
 using LinearAlgebra
@@ -16,7 +15,8 @@ using Statistics
 # Define generic testing framework
 function test_basins(ds, u0s, grid, expected_fs_raw, featurizer;
         rerr = 1e-3, ferr = 1e-3, aerr = 1e-15, ε = nothing, max_distance = Inf,
-        proximity_test = true,
+        proximity_test = true, pairwise_comparison_matrix_test = false, featurizer_matrix = nothing,
+        distance_threshold_pairwise = 1,
         kwargs... # kwargs are propagated to recurrences
     )
     # u0s is Vector{Pair}
@@ -89,6 +89,29 @@ function test_basins(ds, u0s, grid, expected_fs_raw, featurizer;
         )
     end
 
+    @testset "Featurizing, pairwise comparison" begin
+        config = GroupViaPairwiseComparison(; distance_threshold=distance_threshold_pairwise,
+        distance_metric=Euclidean(), rescale_features=false)
+        mapper = AttractorsViaFeaturizing(ds, featurizer, config; Ttr = 500)
+        test_basins_fractions(mapper;
+            err = ferr, single_u_mapping = false, known_ids = [-1, 1, 2, 3]
+        )
+    end
+
+    if pairwise_comparison_matrix_test
+        @testset "Featurizing, pairwise comparison, matrix features" begin
+            function distance_metric_hausdorff(A,B)
+                set_distance(A, B, Hausdorff())
+            end
+            config = GroupViaPairwiseComparison(; distance_threshold=distance_threshold_pairwise,
+            distance_metric=distance_metric_hausdorff, rescale_features=false)
+            mapper = AttractorsViaFeaturizing(ds, featurizer_matrix, config; Ttr = 500)
+            test_basins_fractions(mapper;
+                err = ferr, single_u_mapping = false, known_ids = [-1, 1, 2, 3]
+            )
+        end
+    end
+
     @testset "Featurizing, nearest feature" begin
         # First generate the templates
         function features_from_u(u)
@@ -134,7 +157,7 @@ end
         return any(isinf, x) ? SVector(200.0, 200.0) : x
     end
     test_basins(ds, u0s, grid, expected_fs_raw, featurizer;
-    max_distance = 20, ε = 1e-3, proximity_test = false)
+    max_distance = 20, ε = 1e-3, proximity_test = false, distance_threshold_pairwise=1)
 end
 
 @testset "Magnetic pendulum: projected system" begin
@@ -181,7 +204,11 @@ end
         return SVector(A[end][1], A[end][2])
     end
 
-    test_basins(ds, u0s, grid, expected_fs_raw, featurizer; ε = 0.2, Δt = 1.0, ferr=1e-2)
+    function featurizer_matrix(A, t)
+        return A
+    end
+
+    test_basins(ds, u0s, grid, expected_fs_raw, featurizer; ε = 0.2, Δt = 1.0, ferr=1e-2, featurizer_matrix, pairwise_comparison_matrix_test=true, distance_threshold_pairwise=1)
 end
 
 # Okay, all of these aren't fundamentally new tests.
@@ -218,9 +245,9 @@ if DO_EXTENSIVE_TESTS
             g = exp(entropy(Renyi(; q = 0), probs))
             return SVector(g, minimum(A[:,1]))
         end
-
+        
         test_basins(ds, u0s, grid, expected_fs_raw, featurizer;
-        ε = 0.01, ferr=1e-2, Δt = 0.2, mx_chk_att = 20)
+        ε = 0.01, ferr=1e-2, Δt = 0.2, mx_chk_att = 20, distance_threshold_pairwise=100) #threshold is very high because features haven't really converged yet here
     end
 
     @testset "Duffing oscillator: stroboscopic map" begin
@@ -247,7 +274,7 @@ if DO_EXTENSIVE_TESTS
         end
 
         test_basins(ds, u0s, grid, expected_fs_raw, featurizer;
-        ε = 1.0, ferr=1e-2, rerr = 1e-2, aerr = 5e-3)
+        ε = 1.0, ferr=1e-2, rerr = 1e-2, aerr = 5e-3, distance_threshold_pairwise=1)
     end
 
 
@@ -282,5 +309,5 @@ if DO_EXTENSIVE_TESTS
 
         test_basins(pmap, u0s, grid, expected_fs_raw, thomas_featurizer; ε = 1.0, ferr=1e-2)
     end
-
+    
 end