sillsdev · benjaminking · Apr 2, 2025 · Apr 3, 2025 · Apr 8, 2025 · Apr 11, 2025
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -7,6 +7,7 @@
 from .dbl_bundle_text_corpus import DblBundleTextCorpus
 from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
 from .dictionary_text_corpus import DictionaryTextCorpus
+from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver
 from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
 from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
 from .flatten import flatten
@@ -24,6 +25,13 @@
 from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
+from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass
+from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler
+from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
+from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
+from .quotation_mark_update_settings import QuotationMarkUpdateSettings
+from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
+from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
 from .scripture_element import ScriptureElement
 from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
 from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
@@ -51,7 +59,12 @@
     normalize,
     unescape_spaces,
 )
-from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
+from .update_usfm_parser_handler import (
+    UpdateUsfmMarkerBehavior,
+    UpdateUsfmParserHandler,
+    UpdateUsfmRow,
+    UpdateUsfmTextBehavior,
+)
 from .usfm_file_text import UsfmFileText
 from .usfm_file_text_corpus import UsfmFileTextCorpus
 from .usfm_memory_text import UsfmMemoryText
@@ -81,6 +94,7 @@
     "AlignmentCollection",
     "AlignmentCorpus",
     "AlignmentRow",
+    "FallbackQuotationMarkResolver",
     "batch",
     "Corpus",
     "create_versification_ref_corpus",
@@ -116,6 +130,13 @@
     "PlaceMarkersAlignmentInfo",
     "PlaceMarkersUsfmUpdateBlockHandler",
     "parse_usfm",
+    "QuoteConventionChangingUsfmUpdateBlockHandler",
+    "QuotationMarkUpdateResolutionSettings",
+    "QuotationMarkUpdateStrategy",
+    "QuotationMarkUpdateFirstPass",
+    "QuotationMarkDenormalizationFirstPass",
+    "QuotationMarkDenormalizationUsfmUpdateBlockHandler",
+    "QuotationMarkUpdateSettings",
     "RtlReferenceOrder",
     "ScriptureElement",
     "ScriptureRef",
@@ -135,6 +156,7 @@
     "UpdateUsfmMarkerBehavior",
     "UpdateUsfmParserHandler",
     "UpdateUsfmTextBehavior",
+    "UpdateUsfmRow",
     "UsfmAttribute",
     "UsfmElementType",
     "UsfmFileText",

diff --git a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py
@@ -0,0 +1,120 @@
+from typing import Generator, Optional, Set
+
+from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection
+from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata
+from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue
+from .punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings
+from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver
+from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch
+
+
+class FallbackQuotationMarkResolver(QuotationMarkResolver):
+    # Class overview: resolves marks one at a time from local context, using the smallest depth the settings allow.
+    def __init__(self, settings: QuotationMarkResolutionSettings):
+        self._settings: QuotationMarkResolutionSettings = settings
+        self._last_quotation_mark: Optional[QuotationMarkMetadata] = None  # most recently resolved mark, if any
+        self._issues: Set[QuotationMarkResolutionIssue] = set()  # issues recorded since the last reset
+
+    def reset(self) -> None:  # forget the last resolved mark and drop all recorded issues
+        self._last_quotation_mark = None
+        self._issues = set()
+
+    def resolve_quotation_marks(
+        self, quotation_mark_matches: list[QuotationMarkStringMatch]
+    ) -> Generator[QuotationMarkMetadata, None, None]:  # lazily yields the resolved marks in input order
+        for quotation_mark_match in quotation_mark_matches:
+            yield from self._resolve_quotation_mark(quotation_mark_match)  # may yield nothing
+
+    def _resolve_quotation_mark(
+        self,
+        quotation_mark_match: QuotationMarkStringMatch,
+    ) -> Generator[QuotationMarkMetadata, None, None]:  # yields at most one mark; otherwise records an issue
+        if self._is_opening_quotation_mark(quotation_mark_match):  # tested first, so opening wins over closing
+            quotation_mark: Optional[QuotationMarkMetadata] = self._resolve_opening_mark(quotation_mark_match)
+            if quotation_mark is not None:
+                yield quotation_mark
+            else:  # settings allow no depth for this mark as an opening mark
+                self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK)
+        elif self._is_closing_quotation_mark(quotation_mark_match):
+            quotation_mark: Optional[QuotationMarkMetadata] = self._resolve_closing_mark(quotation_mark_match)
+            if quotation_mark is not None:
+                yield quotation_mark
+            else:  # settings allow no depth for this mark as a closing mark
+                self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK)
+        else:  # neither the opening test nor the closing test accepted the mark
+            self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK)
+
+    def _is_opening_quotation_mark(
+        self,
+        match: QuotationMarkStringMatch,
+    ) -> bool:
+        # Valid in both directions: decide by surrounding context. Valid as opening only: accept outright.
+        if self._settings.is_valid_opening_quotation_mark(match) and self._settings.is_valid_closing_quotation_mark(
+            match
+        ):
+            return (
+                match.is_at_start_of_segment()
+                or match.has_leading_whitespace()
+                or self._does_most_recent_opening_mark_immediately_precede(match)
+                or match.has_quote_introducer_in_leading_substring()
+            ) and not (match.has_trailing_whitespace() or match.has_trailing_punctuation())
+        elif self._settings.is_valid_opening_quotation_mark(match):
+            return True
+
+        return False
+
+    def _does_most_recent_opening_mark_immediately_precede(
+        self,
+        match: QuotationMarkStringMatch,
+    ) -> bool:  # last resolved mark is an opening mark in this segment ending right where this match starts
+        if (
+            self._last_quotation_mark is None
+            or self._last_quotation_mark.direction is not QuotationMarkDirection.OPENING
+        ):
+            return False
+
+        return (
+            self._last_quotation_mark.text_segment == match.text_segment
+            and self._last_quotation_mark.end_index == match.start_index
+        )
+
+    def _is_closing_quotation_mark(
+        self,
+        match: QuotationMarkStringMatch,
+    ) -> bool:
+        # Valid in both directions: decide by surrounding context. Valid as closing only: accept outright.
+        if self._settings.is_valid_opening_quotation_mark(match) and self._settings.is_valid_closing_quotation_mark(
+            match
+        ):
+            return (
+                match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment()
+            ) and not match.has_leading_whitespace()
+        elif self._settings.is_valid_closing_quotation_mark(match):
+            return True
+
+        return False
+
+    def _resolve_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
+        possible_depths: Set[int] = self._settings.get_possible_depths(  # the smallest one is used below
+            quotation_mark_match.quotation_mark, QuotationMarkDirection.OPENING
+        )
+        if len(possible_depths) == 0:
+            return None  # no depth is possible; the caller records the issue
+
+        quotation_mark = quotation_mark_match.resolve(min(possible_depths), QuotationMarkDirection.OPENING)
+        self._last_quotation_mark = quotation_mark  # consulted by the adjacency check for the next mark
+        return quotation_mark
+
+    def _resolve_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
+        possible_depths: Set[int] = self._settings.get_possible_depths(  # the smallest one is used below
+            quotation_mark_match.quotation_mark, QuotationMarkDirection.CLOSING
+        )
+        if len(possible_depths) == 0:
+            return None  # no depth is possible; the caller records the issue
+
+        quotation_mark = quotation_mark_match.resolve(min(possible_depths), QuotationMarkDirection.CLOSING)
+        self._last_quotation_mark = quotation_mark  # a closing mark makes the next adjacency check return False
+        return quotation_mark
+
+    def get_issues(self) -> Set[QuotationMarkResolutionIssue]:  # the live internal set, not a copy
+        return self._issues
diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,11 +1,15 @@
 from abc import ABC, abstractmethod
-from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union
+from typing import BinaryIO, Iterable, Optional, Sequence, Union
 
 from ..utils.typeshed import StrPath
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
-from .scripture_ref import ScriptureRef
-from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
+from .update_usfm_parser_handler import (
+    UpdateUsfmMarkerBehavior,
+    UpdateUsfmParserHandler,
+    UpdateUsfmRow,
+    UpdateUsfmTextBehavior,
+)
 from .usfm_parser import parse_usfm
 from .usfm_update_block_handler import UsfmUpdateBlockHandler
 
@@ -20,7 +24,7 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
     def update_usfm(
         self,
         book_id: str,
-        rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
+        rows: Optional[Sequence[UpdateUsfmRow]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
         paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,

diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -1,41 +1,50 @@
 from __future__ import annotations
 
-from typing import Iterable, List, TypedDict
+from typing import List, TypedDict, cast
 
 from ..translation.word_alignment_matrix import WordAlignmentMatrix
+from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
 from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
 from .usfm_update_block_handler import UsfmUpdateBlockHandler
 
+PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"  # block.metadata key whose value is a PlaceMarkersAlignmentInfo
+
 
 class PlaceMarkersAlignmentInfo(TypedDict):
-    refs: List[str]
     source_tokens: List[str]
     translation_tokens: List[str]
     alignment: WordAlignmentMatrix
+    paragraph_behavior: UpdateUsfmMarkerBehavior  # PRESERVE: paragraph markers qualify for placement; STRIP: ignored
+    style_behavior: UpdateUsfmMarkerBehavior  # PRESERVE: style markers qualify as markers to place
 
 
 class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
 
-    def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None:
-        self._align_info = {info["refs"][0]: info for info in align_info}
-
     def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
-        ref = str(block.refs[0])
         elements = list(block.elements)
 
         # Nothing to do if there are no markers to place or no alignment to use
+        if PLACE_MARKERS_ALIGNMENT_INFO_KEY not in block.metadata:
+            return block
+
+        alignment_info = cast(PlaceMarkersAlignmentInfo, block.metadata[PLACE_MARKERS_ALIGNMENT_INFO_KEY])
         if (
             len(elements) == 0
-            or ref not in self._align_info.keys()
-            or self._align_info[ref]["alignment"].row_count == 0
-            or self._align_info[ref]["alignment"].column_count == 0
+            or alignment_info["alignment"].row_count == 0
+            or alignment_info["alignment"].column_count == 0
             or not any(
                 (
-                    e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
-                    and not e.marked_for_removal
-                    and len(e.tokens) == 1
+                    (
+                        e.type == UsfmUpdateBlockElementType.PARAGRAPH
+                        and alignment_info["paragraph_behavior"] == UpdateUsfmMarkerBehavior.PRESERVE
+                        and len(e.tokens) == 1
+                    )
+                    or (
+                        e.type == UsfmUpdateBlockElementType.STYLE
+                        and alignment_info["style_behavior"] == UpdateUsfmMarkerBehavior.PRESERVE
+                    )
                 )
                 for e in elements
             )
@@ -65,8 +74,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
             ):
                 eob_empty_paras = False
 
-        src_toks = self._align_info[ref]["source_tokens"]
-        trg_toks = self._align_info[ref]["translation_tokens"]
+        src_toks: List[str] = alignment_info["source_tokens"]
+        trg_toks: List[str] = alignment_info["translation_tokens"]
         src_tok_idx = 0
 
         src_sent = ""
@@ -92,7 +101,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
                 else:
                     trg_sent += element.tokens[0].to_usfm()
 
-            if element.marked_for_removal:
+            if element.marked_for_removal or (
+                element.type == UsfmUpdateBlockElementType.PARAGRAPH
+                and alignment_info["paragraph_behavior"] == UpdateUsfmMarkerBehavior.STRIP
+            ):
                 ignored_elements.append(element)
             elif element.type == UsfmUpdateBlockElementType.EMBED:
                 embed_elements.append(element)
@@ -112,9 +124,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
         # Predict marker placements and get insertion order
         to_insert = []
         for element, adj_src_tok in zip(to_place, adj_src_toks):
-            adj_trg_tok = self._predict_marker_location(
-                self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks
-            )
+            adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)
 
             if (
                 adj_trg_tok > 0

diff --git a/machine/corpora/punctuation_analysis/__init__.py b/machine/corpora/punctuation_analysis/__init__.py
@@ -0,0 +1,68 @@
+from .chapter import Chapter
+from .depth_based_quotation_mark_resolver import (
+    DepthBasedQuotationMarkResolver,
+    QuotationMarkCategorizer,
+    QuotationMarkResolverState,
+    QuoteContinuerState,
+    QuoteContinuerStyle,
+)
+from .preliminary_quotation_mark_analyzer import (
+    ApostropheProportionStatistics,
+    PreliminaryApostropheAnalyzer,
+    PreliminaryQuotationMarkAnalyzer,
+    QuotationMarkGrouper,
+    QuotationMarkSequences,
+    QuotationMarkWordPositions,
+)
+from .quotation_mark_direction import QuotationMarkDirection
+from .quotation_mark_finder import QuotationMarkFinder
+from .quotation_mark_metadata import QuotationMarkMetadata
+from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue
+from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings
+from .quotation_mark_resolver import QuotationMarkResolver
+from .quotation_mark_string_match import QuotationMarkStringMatch
+from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator
+from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
+from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
+from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
+from .quote_convention_set import QuoteConventionSet
+from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
+from .text_segment import TextSegment
+from .usfm_marker_type import UsfmMarkerType
+from .usfm_structure_extractor import UsfmStructureExtractor
+from .verse import Verse
+
+__all__ = [  # names re-exported by this package; each entry is imported above
+    "ApostropheProportionStatistics",
+    "Chapter",
+    "DepthBasedQuotationMarkResolver",
+    "PreliminaryApostropheAnalyzer",
+    "PreliminaryQuotationMarkAnalyzer",
+    "SingleLevelQuoteConvention",
+    "QuoteContinuerState",
+    "QuoteContinuerStyle",
+    "QuotationMarkCategorizer",
+    "QuotationMarkCounts",
+    "QuotationMarkDirection",
+    "QuotationMarkGrouper",
+    "QuotationMarkMetadata",
+    "QuotationMarkResolverState",
+    "QuotationMarkSequences",
+    "QuotationMarkStringMatch",
+    "QuotationMarkWordPositions",
+    "QuoteConvention",
+    "QuoteConventionAnalysis",
+    "QuoteConventionDetectionResolutionSettings",
+    "QuotationMarkFinder",
+    "QuotationMarkResolutionIssue",
+    "QuotationMarkResolutionSettings",
+    "QuotationMarkResolver",
+    "QuotationMarkTabulator",
+    "QuoteConventionDetector",
+    "QuoteConventionSet",
+    "STANDARD_QUOTE_CONVENTIONS",
+    "TextSegment",
+    "UsfmMarkerType",
+    "UsfmStructureExtractor",
+    "Verse",
+]
diff --git a/machine/corpora/punctuation_analysis/chapter.py b/machine/corpora/punctuation_analysis/chapter.py
@@ -0,0 +1,8 @@
+from dataclasses import dataclass
+
+from .verse import Verse
+
+
+@dataclass(frozen=True)
+class Chapter:  # frozen record wrapping a list of Verse objects
+    verses: list[Verse]  # frozen only blocks rebinding; the list is still mutable (and makes hash() fail)