Skip to content

Quotation denormalization #203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 30 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
44f2326
Some tests pass
johnml1135 Apr 2, 2025
90f3c93
Added more test framework
johnml1135 Apr 3, 2025
8e49f0b
Basic implementation and tests for quote convention detection
Apr 8, 2025
101d3e2
Initial working version of quotation denormalization
Apr 11, 2025
f7ba4e5
I want to process the data in segments that correspond to individual …
johnml1135 Apr 3, 2025
541ba1c
Updates for reviewer comments
johnml1135 Apr 10, 2025
56f1ae5
Respond to reviewer comments
johnml1135 Apr 11, 2025
f5bf1ee
Additional denormalization tests (not all passing)
Apr 14, 2025
f0c08cb
Rebase + additional denormalization tests
Apr 15, 2025
3adbb5e
Improved handling for NLLB-produced quote errors
Apr 25, 2025
3542952
Unit tests for basic quotation mark resolver
May 8, 2025
aef202c
Unit tests for UsfmStructureExtractor
May 9, 2025
a704d61
Unit tests for several quotation mark analysis classes
Jun 6, 2025
7451a72
Fix a bug related to verse markers before quotation marks
Jun 6, 2025
b1d1f28
Refactoring to allow arbitrary quote convention changes + more unit t…
Jun 20, 2025
a594d75
Remaining unit tests
Jun 27, 2025
defe7dc
Change update_usfm_parser_handler.py to match main branch
Jun 28, 2025
904edad
Fix linting for test_quote_convention_detector.py
Jun 28, 2025
550d8a9
Address reviewer comments + refactor
Jul 1, 2025
83d89b8
Correct TextSegment equality function
Jul 2, 2025
a24ef2b
Damien's requested code-review changes
Jul 9, 2025
f2f929a
Eli's requested code-review changes
Jul 10, 2025
38d3f05
One code review change that was left out of the previous commit
Jul 10, 2025
043c802
Pass metadata through update block (#202)
Enkidu93 Jul 2, 2025
59fcb8f
Add marker behavior to metadata for marker placement, fix bug related…
isaac091 Jul 15, 2025
ab6e853
Report additional ClearML progress via user properties (#205)
pmachapman Jul 16, 2025
bc735e2
Add support to add tags to the ClearML task via build_options (#208)
pmachapman Jul 16, 2025
f4ad9ea
Update machine library to 1.7.4 (#209)
Enkidu93 Jul 16, 2025
ee23dca
Fixes for Eli's hopefully final code review comments
Jul 17, 2025
7e7d577
Fix typing issue in tests for QuotationMarkUpdateFirstPass
Jul 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .dbl_bundle_text_corpus import DblBundleTextCorpus
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
from .dictionary_text_corpus import DictionaryTextCorpus
from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
from .flatten import flatten
Expand All @@ -24,6 +25,13 @@
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass
from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
Expand Down Expand Up @@ -51,7 +59,12 @@
normalize,
unescape_spaces,
)
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
from .update_usfm_parser_handler import (
UpdateUsfmMarkerBehavior,
UpdateUsfmParserHandler,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
)
from .usfm_file_text import UsfmFileText
from .usfm_file_text_corpus import UsfmFileTextCorpus
from .usfm_memory_text import UsfmMemoryText
Expand Down Expand Up @@ -81,6 +94,7 @@
"AlignmentCollection",
"AlignmentCorpus",
"AlignmentRow",
"FallbackQuotationMarkResolver",
"batch",
"Corpus",
"create_versification_ref_corpus",
Expand Down Expand Up @@ -116,6 +130,13 @@
"PlaceMarkersAlignmentInfo",
"PlaceMarkersUsfmUpdateBlockHandler",
"parse_usfm",
"QuoteConventionChangingUsfmUpdateBlockHandler",
"QuotationMarkUpdateResolutionSettings",
"QuotationMarkUpdateStrategy",
"QuotationMarkUpdateFirstPass",
"QuotationMarkDenormalizationFirstPass",
"QuotationMarkDenormalizationUsfmUpdateBlockHandler",
"QuotationMarkUpdateSettings",
"RtlReferenceOrder",
"ScriptureElement",
"ScriptureRef",
Expand All @@ -135,6 +156,7 @@
"UpdateUsfmMarkerBehavior",
"UpdateUsfmParserHandler",
"UpdateUsfmTextBehavior",
"UpdateUsfmRow",
"UsfmAttribute",
"UsfmElementType",
"UsfmFileText",
Expand Down
120 changes: 120 additions & 0 deletions machine/corpora/fallback_quotation_mark_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from typing import Generator, Optional, Set

from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection
from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata
from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue
from .punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings
from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver
from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch


class FallbackQuotationMarkResolver(QuotationMarkResolver):
    """Resolve quotation marks one at a time using settings and local context.

    Each match is classified as opening or closing. A mark that can be
    classified and has at least one possible depth is yielded as metadata;
    otherwise nothing is yielded and an issue is recorded, retrievable through
    ``get_issues``.
    """

    def __init__(self, settings: QuotationMarkResolutionSettings):
        self._settings: QuotationMarkResolutionSettings = settings
        # Most recently resolved mark (opening or closing), if any.
        self._last_quotation_mark: Optional[QuotationMarkMetadata] = None
        self._issues: Set[QuotationMarkResolutionIssue] = set()

    def reset(self) -> None:
        """Forget the last resolved mark and start a fresh, empty issue set."""
        self._issues = set()
        self._last_quotation_mark = None

    def resolve_quotation_marks(
        self, quotation_mark_matches: list[QuotationMarkStringMatch]
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Lazily yield metadata for every match that can be resolved, in order."""
        for candidate in quotation_mark_matches:
            yield from self._resolve_quotation_mark(candidate)

    def _resolve_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Yield the resolved mark for one match, or record why it was skipped."""
        if self._is_opening_quotation_mark(quotation_mark_match):
            resolved = self._resolve_opening_mark(quotation_mark_match)
        elif self._is_closing_quotation_mark(quotation_mark_match):
            resolved = self._resolve_closing_mark(quotation_mark_match)
        else:
            # Neither an opening nor a closing reading applies.
            self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK)
            return

        if resolved is None:
            # The direction was decided, but the settings offer no depth for it.
            self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK)
            return

        yield resolved

    def _is_opening_quotation_mark(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return whether the match should be treated as an opening mark.

        A mark that is valid only as an opening mark is always opening. A mark
        that is valid in both directions is opening only when its leading
        context suggests it and it is not followed by whitespace or punctuation.
        """
        if not self._settings.is_valid_opening_quotation_mark(match):
            return False
        if not self._settings.is_valid_closing_quotation_mark(match):
            return True

        # Valid in both directions: let the surrounding text decide.
        return (
            (
                match.is_at_start_of_segment()
                or match.has_leading_whitespace()
                or self._does_most_recent_opening_mark_immediately_precede(match)
                or match.has_quote_introducer_in_leading_substring()
            )
            and not match.has_trailing_whitespace()
            and not match.has_trailing_punctuation()
        )

    def _does_most_recent_opening_mark_immediately_precede(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return whether the last resolved mark is an opening mark ending where the match starts."""
        previous = self._last_quotation_mark
        if previous is None:
            return False
        if previous.direction is not QuotationMarkDirection.OPENING:
            return False

        return previous.text_segment == match.text_segment and previous.end_index == match.start_index

    def _is_closing_quotation_mark(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return whether the match should be treated as a closing mark.

        A mark that is valid only as a closing mark is always closing. A mark
        that is valid in both directions is closing only when it is followed by
        whitespace, punctuation or the end of the segment and is not preceded
        by whitespace.
        """
        if not self._settings.is_valid_closing_quotation_mark(match):
            return False
        if not self._settings.is_valid_opening_quotation_mark(match):
            return True

        # Valid in both directions: let the surrounding text decide.
        return (
            match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment()
        ) and not match.has_leading_whitespace()

    def _resolve_mark_in_direction(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        direction: QuotationMarkDirection,
    ) -> Optional[QuotationMarkMetadata]:
        """Resolve the match at its smallest possible depth for ``direction``.

        Returns ``None`` when the settings report no possible depth. Otherwise
        the resolved mark is remembered as the most recent one and returned.
        """
        depths: Set[int] = self._settings.get_possible_depths(quotation_mark_match.quotation_mark, direction)
        if len(depths) == 0:
            return None

        resolved = quotation_mark_match.resolve(min(depths), direction)
        self._last_quotation_mark = resolved
        return resolved

    def _resolve_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
        """Resolve the match as an opening mark, or return ``None`` if no depth is possible."""
        return self._resolve_mark_in_direction(quotation_mark_match, QuotationMarkDirection.OPENING)

    def _resolve_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
        """Resolve the match as a closing mark, or return ``None`` if no depth is possible."""
        return self._resolve_mark_in_direction(quotation_mark_match, QuotationMarkDirection.CLOSING)

    def get_issues(self) -> Set[QuotationMarkResolutionIssue]:
        """Return the set of issues recorded since construction or the last ``reset``."""
        return self._issues
12 changes: 8 additions & 4 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union
from typing import BinaryIO, Iterable, Optional, Sequence, Union

from ..utils.typeshed import StrPath
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .scripture_ref import ScriptureRef
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
from .update_usfm_parser_handler import (
UpdateUsfmMarkerBehavior,
UpdateUsfmParserHandler,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_update_block_handler import UsfmUpdateBlockHandler

Expand All @@ -20,7 +24,7 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
def update_usfm(
self,
book_id: str,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand Down
46 changes: 28 additions & 18 deletions machine/corpora/place_markers_usfm_update_block_handler.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,50 @@
from __future__ import annotations

from typing import Iterable, List, TypedDict
from typing import List, TypedDict, cast

from ..translation.word_alignment_matrix import WordAlignmentMatrix
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior
from .usfm_token import UsfmToken, UsfmTokenType
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler

PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"


class PlaceMarkersAlignmentInfo(TypedDict):
refs: List[str]
source_tokens: List[str]
translation_tokens: List[str]
alignment: WordAlignmentMatrix
paragraph_behavior: UpdateUsfmMarkerBehavior
style_behavior: UpdateUsfmMarkerBehavior


class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):

def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None:
self._align_info = {info["refs"][0]: info for info in align_info}

def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
ref = str(block.refs[0])
elements = list(block.elements)

# Nothing to do if there are no markers to place or no alignment to use
if PLACE_MARKERS_ALIGNMENT_INFO_KEY not in block.metadata:
return block

alignment_info = cast(PlaceMarkersAlignmentInfo, block.metadata[PLACE_MARKERS_ALIGNMENT_INFO_KEY])
if (
len(elements) == 0
or ref not in self._align_info.keys()
or self._align_info[ref]["alignment"].row_count == 0
or self._align_info[ref]["alignment"].column_count == 0
or alignment_info["alignment"].row_count == 0
or alignment_info["alignment"].column_count == 0
or not any(
(
e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
and not e.marked_for_removal
and len(e.tokens) == 1
(
e.type == UsfmUpdateBlockElementType.PARAGRAPH
and alignment_info["paragraph_behavior"] == UpdateUsfmMarkerBehavior.PRESERVE
and len(e.tokens) == 1
)
or (
e.type == UsfmUpdateBlockElementType.STYLE
and alignment_info["style_behavior"] == UpdateUsfmMarkerBehavior.PRESERVE
)
)
for e in elements
)
Expand Down Expand Up @@ -65,8 +74,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
):
eob_empty_paras = False

src_toks = self._align_info[ref]["source_tokens"]
trg_toks = self._align_info[ref]["translation_tokens"]
src_toks: List[str] = alignment_info["source_tokens"]
trg_toks: List[str] = alignment_info["translation_tokens"]
src_tok_idx = 0

src_sent = ""
Expand All @@ -92,7 +101,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
else:
trg_sent += element.tokens[0].to_usfm()

if element.marked_for_removal:
if element.marked_for_removal or (
element.type == UsfmUpdateBlockElementType.PARAGRAPH
and alignment_info["paragraph_behavior"] == UpdateUsfmMarkerBehavior.STRIP
):
ignored_elements.append(element)
elif element.type == UsfmUpdateBlockElementType.EMBED:
embed_elements.append(element)
Expand All @@ -112,9 +124,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
# Predict marker placements and get insertion order
to_insert = []
for element, adj_src_tok in zip(to_place, adj_src_toks):
adj_trg_tok = self._predict_marker_location(
self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks
)
adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)

if (
adj_trg_tok > 0
Expand Down
68 changes: 68 additions & 0 deletions machine/corpora/punctuation_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from .chapter import Chapter
from .depth_based_quotation_mark_resolver import (
DepthBasedQuotationMarkResolver,
QuotationMarkCategorizer,
QuotationMarkResolverState,
QuoteContinuerState,
QuoteContinuerStyle,
)
from .preliminary_quotation_mark_analyzer import (
ApostropheProportionStatistics,
PreliminaryApostropheAnalyzer,
PreliminaryQuotationMarkAnalyzer,
QuotationMarkGrouper,
QuotationMarkSequences,
QuotationMarkWordPositions,
)
from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_finder import QuotationMarkFinder
from .quotation_mark_metadata import QuotationMarkMetadata
from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue
from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings
from .quotation_mark_resolver import QuotationMarkResolver
from .quotation_mark_string_match import QuotationMarkStringMatch
from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .text_segment import TextSegment
from .usfm_marker_type import UsfmMarkerType
from .usfm_structure_extractor import UsfmStructureExtractor
from .verse import Verse

# Names exported by this package's __init__ for `from ... import *`.
__all__ = [
"ApostropheProportionStatistics",
"Chapter",
"DepthBasedQuotationMarkResolver",
"PreliminaryApostropheAnalyzer",
"PreliminaryQuotationMarkAnalyzer",
"SingleLevelQuoteConvention",
"QuoteContinuerState",
"QuoteContinuerStyle",
"QuotationMarkCategorizer",
"QuotationMarkCounts",
"QuotationMarkDirection",
"QuotationMarkGrouper",
"QuotationMarkMetadata",
"QuotationMarkResolverState",
"QuotationMarkSequences",
"QuotationMarkStringMatch",
"QuotationMarkWordPositions",
"QuoteConvention",
"QuoteConventionAnalysis",
"QuoteConventionDetectionResolutionSettings",
"QuotationMarkFinder",
"QuotationMarkResolutionIssue",
"QuotationMarkResolutionSettings",
"QuotationMarkResolver",
"QuotationMarkTabulator",
"QuoteConventionDetector",
"QuoteConventionSet",
"STANDARD_QUOTE_CONVENTIONS",
"TextSegment",
"UsfmMarkerType",
"UsfmStructureExtractor",
"Verse",
]
8 changes: 8 additions & 0 deletions machine/corpora/punctuation_analysis/chapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dataclasses import dataclass

from .verse import Verse


@dataclass(frozen=True)
class Chapter:
    """Frozen dataclass holding the list of ``Verse`` objects for one chapter."""

    # The field cannot be reassigned (frozen=True); the list object itself is still a plain list.
    verses: list[Verse]
Loading