Skip to content

Commit d318618

Browse files
authored
feat(low-code concurrent): Add use_global_cursor flag to ConcurrentPerPartitionCursor (#279)
1 parent d9d93ab commit d318618

File tree

2 files changed

+128
-81
lines changed

2 files changed

+128
-81
lines changed

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

Lines changed: 29 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,7 @@ def __init__(
9595
self._lookback_window: int = 0
9696
self._parent_state: Optional[StreamState] = None
9797
self._over_limit: int = 0
98+
self._use_global_cursor: bool = False
9899
self._partition_serializer = PerPartitionKeySerializer()
99100

100101
self._set_initial_state(stream_state)
@@ -105,16 +106,18 @@ def cursor_field(self) -> CursorField:
105106

106107
@property
107108
def state(self) -> MutableMapping[str, Any]:
108-
states = []
109-
for partition_tuple, cursor in self._cursor_per_partition.items():
110-
if cursor.state:
111-
states.append(
112-
{
113-
"partition": self._to_dict(partition_tuple),
114-
"cursor": copy.deepcopy(cursor.state),
115-
}
116-
)
117-
state: dict[str, Any] = {self._PERPARTITION_STATE_KEY: states}
109+
state: dict[str, Any] = {"use_global_cursor": self._use_global_cursor}
110+
if not self._use_global_cursor:
111+
states = []
112+
for partition_tuple, cursor in self._cursor_per_partition.items():
113+
if cursor.state:
114+
states.append(
115+
{
116+
"partition": self._to_dict(partition_tuple),
117+
"cursor": copy.deepcopy(cursor.state),
118+
}
119+
)
120+
state[self._PERPARTITION_STATE_KEY] = states
118121

119122
if self._global_cursor:
120123
state[self._GLOBAL_STATE_KEY] = self._global_cursor
@@ -147,7 +150,8 @@ def close_partition(self, partition: Partition) -> None:
147150
< cursor.state[self.cursor_field.cursor_field_key]
148151
):
149152
self._new_global_cursor = copy.deepcopy(cursor.state)
150-
self._emit_state_message()
153+
if not self._use_global_cursor:
154+
self._emit_state_message()
151155

152156
def ensure_at_least_one_state_emitted(self) -> None:
153157
"""
@@ -225,14 +229,18 @@ def _ensure_partition_limit(self) -> None:
225229
"""
226230
with self._lock:
227231
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
232+
self._over_limit += 1
228233
# Try removing finished partitions first
229234
for partition_key in list(self._cursor_per_partition.keys()):
230-
if partition_key in self._finished_partitions:
235+
if (
236+
partition_key in self._finished_partitions
237+
and self._semaphore_per_partition[partition_key]._value == 0
238+
):
231239
oldest_partition = self._cursor_per_partition.pop(
232240
partition_key
233241
) # Remove the oldest partition
234242
logger.warning(
235-
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
243+
f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._over_limit}."
236244
)
237245
break
238246
else:
@@ -297,6 +305,8 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
297305
self._new_global_cursor = deepcopy(stream_state)
298306

299307
else:
308+
self._use_global_cursor = stream_state.get("use_global_cursor", False)
309+
300310
self._lookback_window = int(stream_state.get("lookback_window", 0))
301311

302312
for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
@@ -320,6 +330,9 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
320330
self._partition_router.set_initial_state(stream_state)
321331

322332
def observe(self, record: Record) -> None:
333+
if not self._use_global_cursor and self.limit_reached():
334+
self._use_global_cursor = True
335+
323336
if not record.associated_slice:
324337
raise ValueError(
325338
"Invalid state as stream slices that are emitted should refer to an existing cursor"
@@ -358,3 +371,6 @@ def _get_cursor(self, record: Record) -> ConcurrentCursor:
358371
)
359372
cursor = self._cursor_per_partition[partition_key]
360373
return cursor
374+
375+
def limit_reached(self) -> bool:
376+
return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER

unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py

Lines changed: 99 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2-
2+
import copy
33
from copy import deepcopy
44
from datetime import datetime, timedelta
55
from typing import Any, List, Mapping, MutableMapping, Optional, Union
@@ -721,6 +721,7 @@ def _run_read(
721721
"cursor": {"created_at": VOTE_300_CREATED_AT},
722722
},
723723
],
724+
"use_global_cursor": False,
724725
"lookback_window": 1,
725726
"parent_state": {},
726727
"state": {"created_at": VOTE_100_CREATED_AT},
@@ -1121,6 +1122,7 @@ def run_incremental_parent_state_test(
11211122
}
11221123
},
11231124
"lookback_window": 1,
1125+
"use_global_cursor": False,
11241126
"states": [
11251127
{
11261128
"partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}},
@@ -1170,8 +1172,66 @@ def test_incremental_parent_state(
11701172
)
11711173

11721174

1175+
STATE_MIGRATION_EXPECTED_STATE = {
1176+
"state": {"created_at": VOTE_100_CREATED_AT},
1177+
"parent_state": {
1178+
"post_comments": {
1179+
"use_global_cursor": False,
1180+
"state": {"updated_at": COMMENT_10_UPDATED_AT},
1181+
"parent_state": {"posts": {"updated_at": POST_1_UPDATED_AT}},
1182+
"lookback_window": 1,
1183+
"states": [
1184+
{
1185+
"partition": {"id": 1, "parent_slice": {}},
1186+
"cursor": {"updated_at": COMMENT_10_UPDATED_AT},
1187+
},
1188+
{
1189+
"partition": {"id": 2, "parent_slice": {}},
1190+
"cursor": {"updated_at": COMMENT_20_UPDATED_AT},
1191+
},
1192+
{
1193+
"partition": {"id": 3, "parent_slice": {}},
1194+
"cursor": {"updated_at": COMMENT_30_UPDATED_AT},
1195+
},
1196+
],
1197+
}
1198+
},
1199+
"lookback_window": 1,
1200+
"use_global_cursor": False,
1201+
"states": [
1202+
{
1203+
"partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}},
1204+
"cursor": {"created_at": VOTE_100_CREATED_AT},
1205+
},
1206+
{
1207+
"partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}},
1208+
"cursor": {"created_at": VOTE_111_CREATED_AT},
1209+
},
1210+
{
1211+
"partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}},
1212+
"cursor": {"created_at": PARTITION_SYNC_START_TIME},
1213+
},
1214+
{
1215+
"partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}},
1216+
"cursor": {"created_at": VOTE_200_CREATED_AT},
1217+
},
1218+
{
1219+
"partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}},
1220+
"cursor": {"created_at": VOTE_210_CREATED_AT},
1221+
},
1222+
{
1223+
"partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}},
1224+
"cursor": {"created_at": VOTE_300_CREATED_AT},
1225+
},
1226+
],
1227+
}
1228+
STATE_MIGRATION_GLOBAL_EXPECTED_STATE = copy.deepcopy(STATE_MIGRATION_EXPECTED_STATE)
1229+
del STATE_MIGRATION_GLOBAL_EXPECTED_STATE["states"]
1230+
STATE_MIGRATION_GLOBAL_EXPECTED_STATE["use_global_cursor"] = True
1231+
1232+
11731233
@pytest.mark.parametrize(
1174-
"test_name, manifest, mock_requests, expected_records, expected_state",
1234+
"test_name, manifest, mock_requests, expected_records",
11751235
[
11761236
(
11771237
"test_incremental_parent_state",
@@ -1326,80 +1386,45 @@ def test_incremental_parent_state(
13261386
"id": 300,
13271387
},
13281388
],
1329-
# Expected state
1389+
),
1390+
],
1391+
)
1392+
@pytest.mark.parametrize(
1393+
"initial_state, expected_state",
1394+
[
1395+
({"created_at": PARTITION_SYNC_START_TIME}, STATE_MIGRATION_EXPECTED_STATE),
1396+
(
13301397
{
1331-
"state": {"created_at": VOTE_100_CREATED_AT},
1398+
"state": {"created_at": PARTITION_SYNC_START_TIME},
1399+
"lookback_window": 0,
1400+
"use_global_cursor": False,
13321401
"parent_state": {
13331402
"post_comments": {
1334-
"use_global_cursor": False,
1335-
"state": {"updated_at": COMMENT_10_UPDATED_AT},
1336-
"parent_state": {"posts": {"updated_at": POST_1_UPDATED_AT}},
1337-
"lookback_window": 1,
1338-
"states": [
1339-
{
1340-
"partition": {"id": 1, "parent_slice": {}},
1341-
"cursor": {"updated_at": COMMENT_10_UPDATED_AT},
1342-
},
1343-
{
1344-
"partition": {"id": 2, "parent_slice": {}},
1345-
"cursor": {"updated_at": COMMENT_20_UPDATED_AT},
1346-
},
1347-
{
1348-
"partition": {"id": 3, "parent_slice": {}},
1349-
"cursor": {"updated_at": COMMENT_30_UPDATED_AT},
1350-
},
1351-
],
1403+
"state": {"updated_at": PARTITION_SYNC_START_TIME},
1404+
"parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}},
1405+
"lookback_window": 0,
13521406
}
13531407
},
1354-
"lookback_window": 1,
1355-
"states": [
1356-
{
1357-
"partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}},
1358-
"cursor": {"created_at": VOTE_100_CREATED_AT},
1359-
},
1360-
{
1361-
"partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}},
1362-
"cursor": {"created_at": VOTE_111_CREATED_AT},
1363-
},
1364-
{
1365-
"partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}},
1366-
"cursor": {"created_at": PARTITION_SYNC_START_TIME},
1367-
},
1368-
{
1369-
"partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}},
1370-
"cursor": {"created_at": VOTE_200_CREATED_AT},
1371-
},
1372-
{
1373-
"partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}},
1374-
"cursor": {"created_at": VOTE_210_CREATED_AT},
1375-
},
1376-
{
1377-
"partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}},
1378-
"cursor": {"created_at": VOTE_300_CREATED_AT},
1379-
},
1380-
],
13811408
},
1409+
STATE_MIGRATION_EXPECTED_STATE,
13821410
),
1383-
],
1384-
)
1385-
@pytest.mark.parametrize(
1386-
"initial_state",
1387-
[
1388-
{"created_at": PARTITION_SYNC_START_TIME},
1389-
{
1390-
"state": {"created_at": PARTITION_SYNC_START_TIME},
1391-
"lookback_window": 0,
1392-
"use_global_cursor": True,
1393-
"parent_state": {
1394-
"post_comments": {
1395-
"state": {"updated_at": PARTITION_SYNC_START_TIME},
1396-
"parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}},
1397-
"lookback_window": 0,
1398-
}
1411+
(
1412+
{
1413+
"state": {"created_at": PARTITION_SYNC_START_TIME},
1414+
"lookback_window": 0,
1415+
"use_global_cursor": True,
1416+
"parent_state": {
1417+
"post_comments": {
1418+
"state": {"updated_at": PARTITION_SYNC_START_TIME},
1419+
"parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}},
1420+
"lookback_window": 0,
1421+
}
1422+
},
13991423
},
1400-
},
1424+
STATE_MIGRATION_GLOBAL_EXPECTED_STATE,
1425+
),
14011426
],
1402-
ids=["legacy_python_format", "low_code_global_format"],
1427+
ids=["legacy_python_format", "low_code_per_partition_state", "low_code_global_format"],
14031428
)
14041429
def test_incremental_parent_state_migration(
14051430
test_name, manifest, mock_requests, expected_records, initial_state, expected_state
@@ -1510,6 +1535,7 @@ def test_incremental_parent_state_migration(
15101535
],
15111536
"state": {"created_at": INITIAL_GLOBAL_CURSOR},
15121537
"lookback_window": 1,
1538+
"use_global_cursor": False,
15131539
},
15141540
),
15151541
],
@@ -1677,13 +1703,14 @@ def test_incremental_parent_state_no_slices(
16771703
"cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR},
16781704
},
16791705
],
1680-
"use_global_cursor": True,
1706+
"use_global_cursor": False,
16811707
"state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR},
16821708
"lookback_window": 0,
16831709
},
16841710
# Expected state
16851711
{
16861712
"lookback_window": 1,
1713+
"use_global_cursor": False,
16871714
"state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR},
16881715
"states": [
16891716
{
@@ -1953,6 +1980,7 @@ def test_incremental_parent_state_no_records(
19531980
},
19541981
"state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR},
19551982
"lookback_window": 86400,
1983+
"use_global_cursor": False,
19561984
"states": [
19571985
{
19581986
"partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}},
@@ -2220,6 +2248,7 @@ def test_incremental_substream_error(
22202248
},
22212249
# Expected state
22222250
{
2251+
"use_global_cursor": False,
22232252
"lookback_window": 1,
22242253
"state": {"updated_at": "2024-01-25T00:00:00Z"},
22252254
"states": [
@@ -2318,6 +2347,7 @@ def test_incremental_list_partition_router(
23182347
# Expected state
23192348
{
23202349
"lookback_window": 0,
2350+
"use_global_cursor": False,
23212351
"state": {"updated_at": "2024-01-08T00:00:00Z"},
23222352
"states": [
23232353
{"cursor": {"updated_at": "2024-01-20T00:00:00Z"}, "partition": {"id": "1"}},
@@ -2845,6 +2875,7 @@ def test_incremental_error(
28452875
}
28462876
},
28472877
"lookback_window": 1,
2878+
"use_global_cursor": False,
28482879
"states": [
28492880
{
28502881
"partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}},

0 commit comments

Comments
 (0)