Skip to content

Commit 181cbed

Browse files
authored
[#234] Fix data loss in Replicated cache when multiple nodes join sequentially (#235)
1 parent 45ed60f commit 181cbed

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed

lib/nebulex/adapters/replicated.ex

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -755,20 +755,15 @@ defmodule Nebulex.Adapters.Replicated.Bootstrap do
755755

756756
nodes ->
757757
# Sync process:
758-
# 1. Push a new generation on all cluster nodes to make the newer one
759-
# empty.
760-
# 2. Copy cached data from one of the cluster nodes; entries will be
761-
# streamed from the older generation since the newer one should be
762-
# empty.
763-
# 3. Push a new generation on the current/new node to make it a mirror
764-
# of the other cluster nodes.
765-
# 4. Reset GC timer for all cluster nodes (making the generation timer
766-
# gap among cluster nodes as small as possible).
767-
with :ok <- maybe_run_on_nodes(adapter_meta, nodes, :new_generation),
768-
:ok <- copy_entries_from_nodes(adapter_meta, nodes),
769-
:ok <- maybe_run_on_nodes(adapter_meta, [node()], :new_generation) do
770-
maybe_run_on_nodes(adapter_meta, nodes, :reset_generation_timer)
771-
end
758+
#
759+
# 1. Copy cached data from existing cluster nodes to the joining node.
760+
# At this point, both the newer and older generations are still present,
761+
# so data may be copied from both, ensuring no data is lost due to premature
762+
# generation rotation.
763+
# 2. Reset the generation GC timer on all nodes to synchronize their GC intervals,
764+
# minimizing timer gaps and ensuring consistent generation rotation across the cluster.
765+
:ok = copy_entries_from_nodes(adapter_meta, nodes)
766+
maybe_run_on_nodes(adapter_meta, cluster_nodes, :reset_generation_timer)
772767
end
773768
end
774769

test/nebulex/adapters/replicated_test.exs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,42 @@ defmodule Nebulex.Adapters.ReplicatedTest do
219219
:ok = stop_caches(node_pid_list)
220220
end)
221221
end
222+
223+
# Regression test for #234: syncing a joining node used to push a new
# generation cluster-wide before copying, which could drop entries. Data
# written before any join must survive several nodes joining one by one.
test "cache data is not lost when multiple nodes join sequentially" do
  # Seed the cache while only the initial cluster nodes are up.
  assert Replicated.put(:a, 1) == :ok
  assert Replicated.get(:a) == 1

  # Bring a third node into the cluster.
  # NOTE(review): the atom below was mangled by the page scrape's email
  # obfuscation ("[email protected]"); the original is presumably a node name
  # such as :"node3@<host>" — confirm against the repository.
  node3 = [:"[email protected]"]
  pids = start_caches(node3, [{Replicated, [name: @cache_name]}])

  wait_until(fn ->
    assert Replicated.nodes() |> :lists.usort() == :lists.usort(cluster_nodes() ++ node3)
  end)

  # The seeded entry must still be readable from every replica.
  wait_until(10, 1000, fn ->
    assert_for_all_replicas(Replicated, :get, [:a], 1)
  end)

  # Bring a fourth node into the cluster.
  # NOTE(review): same scrape obfuscation as above — confirm original name.
  node4 = [:"[email protected]"]
  pids = pids ++ start_caches(node4, [{Replicated, [name: @cache_name]}])

  wait_until(fn ->
    assert Replicated.nodes() |> :lists.usort() ==
             :lists.usort(cluster_nodes() ++ node3 ++ node4)
  end)

  # After the second sequential join the entry must still be present.
  wait_until(10, 1000, fn ->
    assert_for_all_replicas(Replicated, :get, [:a], 1)
  end)

  # Stop every cache started by this test.
  :ok = stop_caches(pids)
end
222258
end
223259

224260
describe "write-like operations locked" do

0 commit comments

Comments
 (0)