10 | 10 | from typing import (
11 | 11 |     Any,
12 | 12 |     Callable,
    | 13 | +    Dict,
13 | 14 |     Iterable,
14 | 15 |     List,
15 | 16 |     Mapping,
@@ -367,14 +368,65 @@ def _read_pages(
367 | 368 |             {"next_page_token": initial_token} if initial_token is not None else None
368 | 369 |         )
369 | 370 |         while not pagination_complete:
370 |     | -            response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
    | 371 | +            property_chunks: List[List[str]] = (
    | 372 | +                list(
    | 373 | +                    self.additional_query_properties.get_request_property_chunks(
    | 374 | +                        stream_slice=stream_slice
    | 375 | +                    )
    | 376 | +                )
    | 377 | +                if self.additional_query_properties
    | 378 | +                else [
    | 379 | +                    []
    | 380 | +                ]  # A single empty property chunk represents the case where property chunking is not configured
    | 381 | +            )
371 | 382 | 
    | 383 | +            merged_records: MutableMapping[str, Any] = defaultdict(dict)
372 | 384 |             last_page_size = 0
373 | 385 |             last_record: Optional[Record] = None
374 |     | -            for record in records_generator_fn(response):
375 |     | -                last_page_size += 1
376 |     | -                last_record = record
377 |     | -                yield record
    | 386 | +            response: Optional[requests.Response] = None
    | 387 | +            for properties in property_chunks:
    | 388 | +                if len(properties) > 0:
    | 389 | +                    stream_slice = StreamSlice(
    | 390 | +                        partition=stream_slice.partition or {},
    | 391 | +                        cursor_slice=stream_slice.cursor_slice or {},
    | 392 | +                        extra_fields={"query_properties": properties},
    | 393 | +                    )
    | 394 | +
    | 395 | +                response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
    | 396 | +                for current_record in records_generator_fn(response):
    | 397 | +                    if (
    | 398 | +                        current_record
    | 399 | +                        and self.additional_query_properties
    | 400 | +                        and self.additional_query_properties.property_chunking
    | 401 | +                    ):
    | 402 | +                        merge_key = (
    | 403 | +                            self.additional_query_properties.property_chunking.get_merge_key(
    | 404 | +                                current_record
    | 405 | +                            )
    | 406 | +                        )
    | 407 | +                        if merge_key:
    | 408 | +                            _deep_merge(merged_records[merge_key], current_record)
    | 409 | +                        else:
    | 410 | +                            # We should still emit records even if the record did not have a merge key
    | 411 | +                            last_page_size += 1
    | 412 | +                            last_record = current_record
    | 413 | +                            yield current_record
    | 414 | +                    else:
    | 415 | +                        last_page_size += 1
    | 416 | +                        last_record = current_record
    | 417 | +                        yield current_record
    | 418 | +
    | 419 | +            if (
    | 420 | +                self.additional_query_properties
    | 421 | +                and self.additional_query_properties.property_chunking
    | 422 | +            ):
    | 423 | +                for merged_record in merged_records.values():
    | 424 | +                    record = Record(
    | 425 | +                        data=merged_record, stream_name=self.name, associated_slice=stream_slice
    | 426 | +                    )
    | 427 | +                    last_page_size += 1
    | 428 | +                    last_record = record
    | 429 | +                    yield record
378 | 430 | 
379 | 431 |             if not response:
380 | 432 |                 pagination_complete = True
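
Note on the hunk above: the pagination loop now fetches one page per property chunk and stitches the partial records together by merge key before emitting, so property chunking and pagination happen in a single pass. A minimal standalone sketch of that flow, where fetch_page and get_merge_key are hypothetical stand-ins rather than CDK APIs:

    from collections import defaultdict
    from typing import Any, Dict, List, MutableMapping

    def fetch_page(properties: List[str]) -> List[Dict[str, Any]]:
        # Hypothetical API call: each chunk returns the record id plus only
        # the properties requested for that chunk.
        full = {"1": {"id": "1", "name": "a", "size": 10}}
        return [
            {k: v for k, v in rec.items() if k == "id" or k in properties}
            for rec in full.values()
        ]

    def get_merge_key(record: Dict[str, Any]) -> str:
        return record["id"]

    merged: MutableMapping[str, Dict[str, Any]] = defaultdict(dict)
    for chunk in [["name"], ["size"]]:  # two property chunks for the same page
        for record in fetch_page(chunk):
            merged[get_merge_key(record)].update(record)

    print(list(merged.values()))  # [{'id': '1', 'name': 'a', 'size': 10}]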
@@ -449,110 +501,43 @@ def read_records(
449 | 501 |         :param stream_slice: The stream slice to read data for
450 | 502 |         :return: The records read from the API source
451 | 503 |         """
452 |     | -
453 |     | -        property_chunks = (
454 |     | -            list(
455 |     | -                self.additional_query_properties.get_request_property_chunks(
456 |     | -                    stream_slice=stream_slice
457 |     | -                )
458 |     | -            )
459 |     | -            if self.additional_query_properties
460 |     | -            else []
461 |     | -        )
462 |     | -        records_without_merge_key = []
463 |     | -        merged_records: MutableMapping[str, Any] = defaultdict(dict)
464 |     | -
465 | 504 |         _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
    | 505 | +
466 | 506 |         most_recent_record_from_slice = None
    | 507 | +        record_generator = partial(
    | 508 | +            self._parse_records,
    | 509 | +            stream_slice=stream_slice,
    | 510 | +            stream_state=self.state or {},
    | 511 | +            records_schema=records_schema,
    | 512 | +        )
467 | 513 | 
468 |     | -        if self.additional_query_properties:
469 |     | -            for properties in property_chunks:
470 |     | -                _slice = StreamSlice(
471 |     | -                    partition=_slice.partition or {},
472 |     | -                    cursor_slice=_slice.cursor_slice or {},
473 |     | -                    extra_fields={"query_properties": properties},
474 |     | -                )  # None-check
475 |     | -
476 |     | -                record_generator = partial(
477 |     | -                    self._parse_records,
478 |     | -                    stream_slice=_slice,
479 |     | -                    stream_state=self.state or {},
480 |     | -                    records_schema=records_schema,
481 |     | -                )
    | 514 | +        if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
    | 515 | +            stream_state = self.state
482 | 516 | 
483 |     | -                for stream_data in self._read_pages(record_generator, self.state, _slice):
484 |     | -                    current_record = self._extract_record(stream_data, _slice)
485 |     | -                    if self.cursor and current_record:
486 |     | -                        self.cursor.observe(_slice, current_record)
    | 517 | +            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
    | 518 | +            # fetch more records. The platform deletes stream state for full refresh streams before starting a
    | 519 | +            # new job, so we don't need to worry about this value existing for the initial attempt
    | 520 | +            if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
    | 521 | +                return
487 | 522 | 
488 |     | -                    # Latest record read, not necessarily within slice boundaries.
489 |     | -                    # TODO Remove once all custom components implement `observe` method.
490 |     | -                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
491 |     | -                    most_recent_record_from_slice = self._get_most_recent_record(
492 |     | -                        most_recent_record_from_slice, current_record, _slice
493 |     | -                    )
    | 523 | +            yield from self._read_single_page(record_generator, stream_state, _slice)
    | 524 | +        else:
    | 525 | +            for stream_data in self._read_pages(record_generator, self.state, _slice):
    | 526 | +                current_record = self._extract_record(stream_data, _slice)
    | 527 | +                if self.cursor and current_record:
    | 528 | +                    self.cursor.observe(_slice, current_record)
    | 529 | +
    | 530 | +                # Latest record read, not necessarily within slice boundaries.
    | 531 | +                # TODO Remove once all custom components implement `observe` method.
    | 532 | +                # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
    | 533 | +                most_recent_record_from_slice = self._get_most_recent_record(
    | 534 | +                    most_recent_record_from_slice, current_record, _slice
    | 535 | +                )
    | 536 | +                yield stream_data
494 | 537 | 
495 |     | -                    if current_record and self.additional_query_properties.property_chunking:
496 |     | -                        merge_key = (
497 |     | -                            self.additional_query_properties.property_chunking.get_merge_key(
498 |     | -                                current_record
499 |     | -                            )
500 |     | -                        )
501 |     | -                        if merge_key:
502 |     | -                            merged_records[merge_key].update(current_record)
503 |     | -                        else:
504 |     | -                            # We should still emit records even if the record did not have a merge key
505 |     | -                            records_without_merge_key.append(current_record)
506 |     | -                    else:
507 |     | -                        yield stream_data
508 | 538 |             if self.cursor:
509 | 539 |                 self.cursor.close_slice(_slice, most_recent_record_from_slice)
510 |     | -
511 |     | -            if len(merged_records) > 0:
512 |     | -                yield from [
513 |     | -                    Record(data=merged_record, stream_name=self.name, associated_slice=stream_slice)
514 |     | -                    for merged_record in merged_records.values()
515 |     | -                ]
516 |     | -            if len(records_without_merge_key) > 0:
517 |     | -                yield from records_without_merge_key
518 |     | -        else:
519 |     | -            _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
520 |     | -
521 |     | -            most_recent_record_from_slice = None
522 |     | -            record_generator = partial(
523 |     | -                self._parse_records,
524 |     | -                stream_slice=stream_slice,
525 |     | -                stream_state=self.state or {},
526 |     | -                records_schema=records_schema,
527 |     | -            )
528 |     | -
529 |     | -            if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
530 |     | -                stream_state = self.state
531 |     | -
532 |     | -                # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
533 |     | -                # fetch more records. The platform deletes stream state for full refresh streams before starting a
534 |     | -                # new job, so we don't need to worry about this value existing for the initial attempt
535 |     | -                if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
536 |     | -                    return
537 |     | -
538 |     | -                yield from self._read_single_page(record_generator, stream_state, _slice)
539 |     | -            else:
540 |     | -                for stream_data in self._read_pages(record_generator, self.state, _slice):
541 |     | -                    current_record = self._extract_record(stream_data, _slice)
542 |     | -                    if self.cursor and current_record:
543 |     | -                        self.cursor.observe(_slice, current_record)
544 |     | -
545 |     | -                    # Latest record read, not necessarily within slice boundaries.
546 |     | -                    # TODO Remove once all custom components implement `observe` method.
547 |     | -                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
548 |     | -                    most_recent_record_from_slice = self._get_most_recent_record(
549 |     | -                        most_recent_record_from_slice, current_record, _slice
550 |     | -                    )
551 |     | -                    yield stream_data
552 |     | -
553 |     | -                if self.cursor:
554 |     | -                    self.cursor.close_slice(_slice, most_recent_record_from_slice)
555 |     | -                return
    | 540 | +        return
556 | 541 | 
557 | 542 |     def _get_most_recent_record(
558 | 543 |         self,
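
Note on the hunk above: read_records now builds record_generator once and branches on the cursor type, with resumable full refresh (RFR) streams returning early when state shows the prior attempt already completed. A rough sketch of that early-exit pattern, with the state key value assumed here:

    from typing import Any, Iterable, Mapping

    # Assumed value; in the CDK this is the FULL_REFRESH_SYNC_COMPLETE_KEY constant.
    FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete"

    def read_full_refresh(state: Mapping[str, Any], pages: Iterable[Any]) -> Iterable[Any]:
        # A prior successful attempt left this marker in state, so there is nothing
        # left to read; the platform clears state before starting a brand-new job.
        if state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
            return
        yield from pages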
@@ -639,6 +624,26 @@ def _to_partition_key(to_serialize: Any) -> str:
639 | 624 |     return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True)
640 | 625 | 
641 | 626 | 
    | 627 | +def _deep_merge(
    | 628 | +    target: MutableMapping[str, Any], source: Union[Record, MutableMapping[str, Any]]
    | 629 | +) -> None:
    | 630 | +    """
    | 631 | +    Recursively merge two dictionaries, combining nested dictionaries instead of overwriting them.
    | 632 | +
    | 633 | +    :param target: The dictionary to merge into (modified in place)
    | 634 | +    :param source: The dictionary to merge from
    | 635 | +    """
    | 636 | +    for key, value in source.items():
    | 637 | +        if (
    | 638 | +            key in target
    | 639 | +            and isinstance(target[key], MutableMapping)
    | 640 | +            and isinstance(value, MutableMapping)
    | 641 | +        ):
    | 642 | +            _deep_merge(target[key], value)
    | 643 | +        else:
    | 644 | +            target[key] = value
    | 645 | +
    | 646 | +
642 | 647 | @dataclass
643 | 648 | class SimpleRetrieverTestReadDecorator(SimpleRetriever):
644 | 649 |     """
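
Note on the hunk above: _deep_merge replaces the plain dict.update used in the old read_records path; the difference only shows up on nested objects. A quick illustration:

    target = {"id": "1", "properties": {"name": "a"}}
    source = {"id": "1", "properties": {"size": 10}}

    # dict.update would overwrite "properties" wholesale, dropping "name".
    # _deep_merge recurses into the nested mapping and keeps both keys:
    _deep_merge(target, source)
    print(target)  # {'id': '1', 'properties': {'name': 'a', 'size': 10}}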