Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manifold learning notebooks #278

Merged
merged 9 commits into from
Jun 16, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 34 additions & 39 deletions lib/scholar/manifold/trimap.ex
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,10 @@ defmodule Scholar.Manifold.Trimap do
@opts_schema NimbleOptions.new!(opts_schema)

# Tempered logarithm log_t(x): for t ~ 1 it degenerates to the natural
# logarithm; otherwise it uses the generalized form
# (x^(1 - t) - 1) / (1 - t). Float literals keep the computation in
# floating point inside defn.
defnp tempered_log(x, t) do
  if Nx.abs(t - 1.0) < 1.0e-5 do
    Nx.log(x)
  else
    1.0 / (1.0 - t) * (x ** (1.0 - t) - 1.0)
  end
end

Expand Down Expand Up @@ -181,7 +181,7 @@ defmodule Scholar.Manifold.Trimap do
{samples, key, _, _, _} =
while {samples, key, discard, rejects, i}, Nx.any(discard) do
{new_samples, key} = Nx.Random.randint(key, 0, opts[:maxval], shape: {elem(shape, 1)})
discard = in1d(new_samples, rejects[i])
discard = in1d(new_samples, rejects[i]) or in1d(new_samples, samples)
samples = Nx.select(discard, samples, new_samples)
{samples, key, in1d(samples, rejects[i]), rejects, i}
end
Expand Down Expand Up @@ -231,9 +231,9 @@ defmodule Scholar.Manifold.Trimap do
sim = triplets[[.., 1]]
out = triplets[[.., 2]]

p_sim = handle_dist(inputs[anc], inputs[sim], opts) / (sig[anc] * sig[sim])
p_sim = -(handle_dist(inputs[anc], inputs[sim], opts) ** 2) / (sig[anc] * sig[sim])

p_out = handle_dist(inputs[anc], inputs[out], opts) / (sig[anc] * sig[out])
p_out = -(handle_dist(inputs[anc], inputs[out], opts) ** 2) / (sig[anc] * sig[out])

flip = p_sim < p_out
weights = p_sim - p_out
Expand All @@ -255,7 +255,7 @@ defmodule Scholar.Manifold.Trimap do
hits = Nx.flatten(neighbors)

distances =
handle_dist(inputs[anchors], inputs[hits], opts) |> Nx.reshape({num_points, :auto})
(handle_dist(inputs[anchors], inputs[hits], opts) ** 2) |> Nx.reshape({num_points, :auto})

sigmas = Nx.max(Nx.mean(Nx.sqrt(distances[[.., 3..5]]), axes: [1]), 1.0e-10)

Expand All @@ -268,65 +268,58 @@ defmodule Scholar.Manifold.Trimap do
end

# Computes the weight of each triplet (anchor, similar, outlier) as the
# difference between the scaled similarity of the (anchor, similar) pair
# and the scaled similarity of the (anchor, outlier) pair.
#
# `distances` holds precomputed anchor-to-inlier distances (one row per
# point), so only the outlier distances are computed here. Distances are
# squared to match the scaling used elsewhere in the module.
defnp find_triplet_weights(inputs, triplets, neighbors, sigmas, distances, opts \\ []) do
  {num_points, num_inliers} = Nx.shape(neighbors)

  p_sim = -Nx.flatten(distances)

  # Each (point, inlier) pair is repeated once per sampled outlier.
  num_outliers = div(Nx.axis_size(triplets, 0), num_points * num_inliers)

  p_sim =
    Nx.tile(Nx.reshape(p_sim, {num_points, num_inliers}), [1, num_outliers]) |> Nx.flatten()

  out_distances = handle_dist(inputs[triplets[[.., 0]]], inputs[triplets[[.., 2]]], opts) ** 2

  # Scale outlier distances by the per-point bandwidths (sigmas).
  p_out = -out_distances / (sigmas[triplets[[.., 0]]] * sigmas[triplets[[.., 2]]])
  p_sim - p_out
end

defnp generate_triplets(key, inputs, opts \\ []) do
num_inliners = opts[:num_inliers]
num_inliers = opts[:num_inliers]
num_random = opts[:num_random]
weight_temp = opts[:weight_temp]
num_points = Nx.axis_size(inputs, 0)
num_extra = min(num_inliners + 50, num_points)

nndescent =
Scholar.Neighbors.NNDescent.fit(inputs,
num_neighbors: num_extra,
tree_init?: false,
metric: opts[:metric],
tol: 1.0e-5
)
num_extra = min(num_inliers + 50, num_points)

neighbors = nndescent.nearest_neighbors
{neighbors, _distances} =
Scholar.Neighbors.LargeVis.fit(inputs, num_neighbors: num_extra, metric: opts[:metric])

neighbors = Nx.concatenate([Nx.iota({num_points, 1}), neighbors], axis: 1)

{knn_distances, neighbors, sigmas} = find_scaled_neighbors(inputs, neighbors, opts)

neighbors = neighbors[[.., 0..num_inliners]]
knn_distances = knn_distances[[.., 0..num_inliners]]
neighbors = neighbors[[.., 0..num_inliers]]
knn_distances = knn_distances[[.., 0..num_inliers]]

{triplets, key} =
sample_knn_triplets(key, neighbors,
num_outliers: opts[:num_outliers],
num_inliers: num_inliners,
num_inliers: num_inliers,
num_points: num_points
)

weights =
find_triplet_weights(
inputs,
triplets,
neighbors[[.., 1..num_inliners]],
neighbors[[.., 1..num_inliers]],
sigmas,
knn_distances[[.., 1..num_inliners]],
knn_distances[[.., 1..num_inliers]],
opts
)

flip = weights < 0
anchors = triplets[[.., 0]] |> Nx.reshape({:auto, 1})
pairs = triplets[[.., 1..2]]
pairs = triplets[[.., 1..-1//1]]

pairs =
Nx.select(
Expand Down Expand Up @@ -386,7 +379,7 @@ defmodule Scholar.Manifold.Trimap do
{loss, num_violated}
end

# Scalar triplet loss of the embedding. Takes a single tuple argument so
# the function can be differentiated with `Nx.Defn.grad/2` with respect to
# the embedding while closing over triplets and weights.
defn trimap_loss({embedding, triplets, weights}) do
  {loss, _} = trimap_metrics(embedding, triplets, weights)
  loss
end
Expand Down Expand Up @@ -439,21 +432,18 @@ defmodule Scholar.Manifold.Trimap do
{triplets, weights, key, applied_pca?} =
case triplets do
{} ->
inputs =
{inputs, applied_pca} =
if num_components > @dim_pca do
inputs = inputs - Nx.mean(inputs, axes: [0])
{u, s, vt} = Nx.LinAlg.SVD.svd(inputs, full_matrices: false)
inputs = Nx.dot(u[[.., 0..@dim_pca]] * s[0..@dim_pca], vt[[0..@dim_pca, ..]])

inputs = inputs - Nx.reduce_min(inputs)
inputs = inputs / Nx.reduce_max(inputs)
inputs - Nx.mean(inputs, axes: [0])
{inputs, Nx.u8(1)}
else
inputs
{inputs, Nx.u8(0)}
end

{triplets, weights, key} = generate_triplets(key, inputs, opts)
{triplets, weights, key, Nx.u8(1)}
{triplets, weights, key, applied_pca}

_ ->
{triplets, weights, key, Nx.u8(0)}
Expand All @@ -475,7 +465,10 @@ defmodule Scholar.Manifold.Trimap do

opts[:init_embedding_type] == 1 ->
{random_embedding, _key} =
Nx.Random.normal(key, shape: {num_points, opts[:num_components]})
Nx.Random.normal(key,
shape: {num_points, opts[:num_components]},
type: to_float_type(inputs)
)

random_embedding * @init_scale
end
Expand All @@ -493,10 +486,12 @@ defmodule Scholar.Manifold.Trimap do
{embeddings, _} =
while {embeddings, {vel, gain, lr, triplets, weights, i = Nx.s64(0)}},
i < opts[:num_iters] do
gamma = if i < @switch_iter, do: @final_momentum, else: @init_momentum
grad = trimap_loss(embeddings + gamma * vel, triplets, weights)
gamma = if i < @switch_iter, do: @init_momentum, else: @final_momentum

gradient =
grad(embeddings + gamma * vel, fn x -> trimap_loss({x, triplets, weights}) end)

{embeddings, vel, gain} = update_embedding_dbd(embeddings, grad, vel, gain, lr, i)
{embeddings, vel, gain} = update_embedding_dbd(embeddings, gradient, vel, gain, lr, i)

{embeddings, {vel, gain, lr, triplets, weights, i + 1}}
end
Expand Down
Binary file added notebooks/files/mammoth.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
155 changes: 155 additions & 0 deletions notebooks/manifold_learning.livemd
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<!-- livebook:{"persist_outputs":true} -->

# Manifold Learning

```elixir
Mix.install([
{:scholar, github: "elixir-nx/scholar"},
{:explorer, "~> 0.8.2", override: true},
{:exla, "~> 0.7.2"},
{:nx, "~> 0.7.2"},
{:req, "~> 0.4.14"},
{:kino_vega_lite, "~> 0.1.11"},
{:kino, "~> 0.12.3"},
{:kino_explorer, "~> 0.1.18"},
{:tucan, "~> 0.3.1"}
])
```

## Setup

We will use `Explorer` in this notebook, so let's define an alias for its main module, `Explorer.DataFrame`:
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

```elixir
require Explorer.DataFrame, as: DF
```

And let's configure `EXLA` as our default backend (where our tensors are stored) and compiler (which compiles Scholar code) across the notebook and all branched sections:

```elixir
Nx.global_default_backend(EXLA.Backend)
Nx.Defn.global_default_options(compiler: EXLA)
```

## Testing Manifold Learning Functionalities

In this notebook, we test how manifold learning algorithms work and what results we can get from them.
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

Firstly, let's fetch the dataset that we will experiment on. The data represents the 3D coordinates of a mammoth. Below we include a figure of the original dataset.
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

![](files/mammoth.png)

```elixir
source = "https://raw.githubusercontent.com/MNoichl/UMAP-examples-mammoth-/master/mammoth_a.csv"

data = Req.get!(source).body

df = DF.load_csv!(data)
```

Now, let's convert the dataframe into a tensor, so we can manipulate the data using `Scholar`.

```elixir
tensor_data = Nx.stack(df, axis: 1)
```

Since there are almost 1 million data points and they are sorted, we shuffle the dataset and then use only part of it.

We start with Trimap. It's a manifold learning algorithm that is based on nearest neighbors. It preserves the global structure of the dataset, but it doesn't handle the local structure properly. Let's look at what the result of Trimap on the mammoth dataset will be.
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

```elixir
{tensor_data, key} = EXLA.jit(&Nx.Random.shuffle(&1, &2)).(Nx.Random.key(42), tensor_data)
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

trimap_res =
EXLA.jit(
&Scholar.Manifold.Trimap.embed(&1,
key: &2,
num_components: 2,
num_inliers: 12,
num_outliers: 4,
weight_temp: 0.5,
learning_rate: 0.1,
metric: :squared_euclidean
)
).(
tensor_data[[0..10000, ..]],
Nx.Random.key(55)
)
```

Now, let's plot the results of the Trimap algorithm:

```elixir
coords = [
x: trimap_res[[.., 0]] |> Nx.to_flat_list(),
y: trimap_res[[.., 1]] |> Nx.to_flat_list()
]

Tucan.layers([
Tucan.scatter(coords, "x", "y", point_size: 1)
])
|> Tucan.set_size(300, 300)
|> Tucan.set_title(
"Mammoth dataset with reduced dimensionality using Trimap",
offset: 25
)
```

We can certainly recognize a mammoth in this picture. Trimap indeed preserved the global structure of the data. The result is similar to a projection of the 3D mammoth onto the YZ plane. Now, let's plot this projection and compare the two plots.

```elixir
coords = [
x: tensor_data[[0..10000, 1]] |> Nx.to_flat_list(),
y: tensor_data[[0..10000, 2]] |> Nx.to_flat_list()
]

Tucan.layers([
Tucan.scatter(coords, "x", "y", point_size: 1)
])
|> Tucan.set_size(300, 300)
|> Tucan.set_title(
"Projection of the mammoth dataset onto the YZ plane",
offset: 25
)
```

These two plots are similar, but there are some important differences. Even if the second figure seems "prettier", it is less informative than the result of Trimap. In the first figure, we can spot two tusks, while in the second one they overlap and we see only one. Similarly, the legs overlap in the first plot, while in the second one they are spread out and don't intersect with each other.

Now, let's try a different algorithm: t-SNE.
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

```elixir
tsne_res =
EXLA.jit(
&Scholar.Manifold.TSNE.fit(&1,
key: &2,
num_components: 2,
perplexity: 125,
exaggeration: 10.0,
learning_rate: 500,
metric: :squared_euclidean
)
).(
tensor_data[[0..2000, ..]],
Nx.Random.key(55)
)
```

```elixir
coords = [
x: tsne_res[[.., 0]] |> Nx.to_flat_list(),
y: tsne_res[[.., 1]] |> Nx.to_flat_list()
]

Tucan.layers([
Tucan.scatter(coords, "x", "y", point_size: 1)
])
|> Tucan.set_size(300, 300)
|> Tucan.set_title(
"Mammoth dataset with reduced dimensionality using t-SNE",
offset: 25
)
```

As we can see, t-SNE gives completely different results than Trimap. This is because t-SNE has a completely different mathematical background. t-SNE is also a slower algorithm, so it can't be applied to datasets as big as those Trimap can handle. However, t-SNE still preserves some features of the mammoth, like the small tusks, feet, and body. You can experiment with the *perplexity* parameter, which can significantly change the output of the algorithm.
msluszniak marked this conversation as resolved.
Show resolved Hide resolved

<!-- livebook:{"offset":4756,"stamp":{"token":"XCP.N4iPPNOOaf_v2YT6Ztr5xZap7jLZO88ChjT4Zo_vGiGvGM3Y2dj633k1VoBWjjgxmvxiOM2-XAZXsQFQWxNx6f4mfs6_Tt_Dq7WI_A","version":2}} -->
4 changes: 2 additions & 2 deletions test/scholar/manifold/trimap_test.exs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
defmodule Scholar.Manifold.TrimapTest do
use Scholar.Case, async: true
alias Scholar.Manifold.Trimap
doctest Trimap

Check failure on line 4 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

doctest Scholar.Manifold.Trimap.embed/2 (1) (Scholar.Manifold.TrimapTest)

Check failure on line 4 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

doctest Scholar.Manifold.Trimap.embed/2 (1) (Scholar.Manifold.TrimapTest)

describe "transform" do
test "non default num_inliers and num_outliers" do

Check failure on line 7 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

test transform non default num_inliers and num_outliers (Scholar.Manifold.TrimapTest)

Check failure on line 7 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

test transform non default num_inliers and num_outliers (Scholar.Manifold.TrimapTest)
x = Nx.iota({5, 6})
key = Nx.Random.key(42)
res = Trimap.embed(x, num_components: 2, key: key, num_inliers: 3, num_outliers: 1)
Expand All @@ -21,7 +21,7 @@
assert_all_close(res, expected, atol: 1.0e-3, rtol: 1.0e-3)
end

test "non default num_random, weight_temp, and learning_rate" do

Check failure on line 24 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

test transform non default num_random, weight_temp, and learning_rate (Scholar.Manifold.TrimapTest)

Check failure on line 24 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

test transform non default num_random, weight_temp, and learning_rate (Scholar.Manifold.TrimapTest)
x = Nx.iota({5, 6})
key = Nx.Random.key(42)

Expand All @@ -48,7 +48,7 @@
assert_all_close(res, expected, atol: 1.0e-3, rtol: 1.0e-3)
end

test "non default num_iters and init_embedding_type" do

Check failure on line 51 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

test transform non default num_iters and init_embedding_type (Scholar.Manifold.TrimapTest)

Check failure on line 51 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

test transform non default num_iters and init_embedding_type (Scholar.Manifold.TrimapTest)
x = Nx.iota({5, 6})
key = Nx.Random.key(42)

Expand All @@ -74,7 +74,7 @@
assert_all_close(res, expected, atol: 1.0e-1, rtol: 1.0e-1)
end

test "passed precomputed triplets and weights" do

Check failure on line 77 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

test transform passed precomputed triplets and weights (Scholar.Manifold.TrimapTest)

Check failure on line 77 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

test transform passed precomputed triplets and weights (Scholar.Manifold.TrimapTest)
x = Nx.iota({5, 6})
key = Nx.Random.key(42)
triplets = Nx.tensor([[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 0], [4, 0, 1]])
Expand Down Expand Up @@ -102,7 +102,7 @@
assert_all_close(res, expected, atol: 1.0e-3, rtol: 1.0e-3)
end

test "passed initial_embedding" do

Check failure on line 105 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

test transform passed initial_embedding (Scholar.Manifold.TrimapTest)

Check failure on line 105 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

test transform passed initial_embedding (Scholar.Manifold.TrimapTest)
x = Nx.iota({5, 6})
key = Nx.Random.key(42)

Expand Down Expand Up @@ -136,7 +136,7 @@
assert_all_close(res, expected, atol: 1.0e-3, rtol: 1.0e-3)
end

test "metric set to manhattan" do
test "metric set to euclidean" do

Check failure on line 139 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.15.6, 26.1, true)

test transform metric set to euclidean (Scholar.Manifold.TrimapTest)

Check failure on line 139 in test/scholar/manifold/trimap_test.exs

View workflow job for this annotation

GitHub Actions / main (1.14.5, 26.1)

test transform metric set to euclidean (Scholar.Manifold.TrimapTest)
x = Nx.iota({5, 6})
key = Nx.Random.key(42)

Expand All @@ -146,7 +146,7 @@
key: key,
num_inliers: 3,
num_outliers: 1,
metric: :manhattan
metric: :euclidean
)

expected =
Expand Down
Loading