Skip to content

Commit b1d1f28

Browse files
author
Ben King
committed
Refactoring to allow arbitrary quote convention changes + more unit tests
1 parent 7451a72 commit b1d1f28

17 files changed

+1444
-944
lines changed

machine/corpora/__init__.py

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,12 @@
22
from .alignment_collection import AlignmentCollection
33
from .alignment_corpus import AlignmentCorpus
44
from .alignment_row import AlignmentRow
5-
from .basic_quotation_mark_resolver import BasicQuotationMarkResolver
65
from .corpora_utils import batch
76
from .corpus import Corpus
87
from .dbl_bundle_text_corpus import DblBundleTextCorpus
98
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
109
from .dictionary_text_corpus import DictionaryTextCorpus
10+
from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver
1111
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
1212
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
1313
from .flatten import flatten
@@ -25,11 +25,13 @@
2525
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
2626
from .paratext_text_corpus import ParatextTextCorpus
2727
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
28-
from .quotation_denormalization_action import QuotationDenormalizationAction
2928
from .quotation_denormalization_first_pass import QuotationDenormalizationFirstPass
30-
from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings
31-
from .quotation_denormalization_settings import QuotationDenormalizationSettings
3229
from .quotation_denormalization_usfm_update_block_handler import QuotationDenormalizationUsfmUpdateBlockHandler
30+
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
31+
from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
32+
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
33+
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
34+
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
3335
from .scripture_element import ScriptureElement
3436
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
3537
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
@@ -87,7 +89,7 @@
8789
"AlignmentCollection",
8890
"AlignmentCorpus",
8991
"AlignmentRow",
90-
"BasicQuotationMarkResolver",
92+
"FallbackQuotationMarkResolver",
9193
"batch",
9294
"Corpus",
9395
"create_versification_ref_corpus",
@@ -123,11 +125,13 @@
123125
"PlaceMarkersAlignmentInfo",
124126
"PlaceMarkersUsfmUpdateBlockHandler",
125127
"parse_usfm",
126-
"QuotationDenormalizationAction",
128+
"QuoteConventionChangingUsfmUpdateBlockHandler",
129+
"QuotationMarkUpdateResolutionSettings",
130+
"QuotationMarkUpdateStrategy",
131+
"QuotationMarkUpdateFirstPass",
127132
"QuotationDenormalizationFirstPass",
128133
"QuotationDenormalizationUsfmUpdateBlockHandler",
129-
"QuotationDenormalizationResolutionSettings",
130-
"QuotationDenormalizationSettings",
134+
"QuotationMarkUpdateSettings",
131135
"RtlReferenceOrder",
132136
"ScriptureElement",
133137
"ScriptureRef",

machine/corpora/analysis/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings
88
from .quotation_mark_resolver import QuotationMarkResolver
99
from .quotation_mark_string_match import QuotationMarkStringMatch
10+
from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator
1011
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
1112
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
1213
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
@@ -20,6 +21,7 @@
2021
"Chapter",
2122
"DepthBasedQuotationMarkResolver",
2223
"SingleLevelQuoteConvention",
24+
"QuotationMarkCounts",
2325
"QuotationMarkDirection",
2426
"QuotationMarkMetadata",
2527
"QuotationMarkStringMatch",
@@ -30,6 +32,7 @@
3032
"QuotationMarkResolutionIssue",
3133
"QuotationMarkResolutionSettings",
3234
"QuotationMarkResolver",
35+
"QuotationMarkTabulator",
3336
"QuoteConventionDetector",
3437
"QuoteConventionSet",
3538
"TextSegment",

machine/corpora/analysis/quote_convention.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,20 @@ def __init__(self, name: str, levels: list[SingleLevelQuoteConvention]):
4848
self.name = name
4949
self.levels = levels
5050

51+
def __eq__(self, value):
    """Return whether *value* is an equivalent QuoteConvention.

    Two conventions compare equal when they have the same name, the same
    number of levels, and every corresponding level has the same opening
    and closing quotation marks.

    Returns ``NotImplemented`` (rather than ``False``) when *value* is not a
    QuoteConvention, so Python can try the reflected comparison on the other
    operand; ``convention == other`` still evaluates to ``False`` when
    neither side knows how to compare.
    """
    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable -- confirm no caller uses QuoteConvention objects as dict
    # keys or set members.
    if not isinstance(value, QuoteConvention):
        return NotImplemented
    if self.name != value.name or len(self.levels) != len(value.levels):
        return False
    # The lengths match at this point, so zip() pairs every level with its
    # counterpart without silently truncating either list.
    return all(
        level.get_opening_quote() == other_level.get_opening_quote()
        and level.get_closing_quote() == other_level.get_closing_quote()
        for level, other_level in zip(self.levels, value.levels)
    )
64+
5165
def get_name(self) -> str:
5266
return self.name
5367

machine/corpora/basic_quotation_mark_resolver.py renamed to machine/corpora/fallback_quotation_mark_resolver.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from .analysis.quotation_mark_string_match import QuotationMarkStringMatch
99

1010

11-
class BasicQuotationMarkResolver(QuotationMarkResolver):
11+
class FallbackQuotationMarkResolver(QuotationMarkResolver):
1212

1313
def __init__(self, settings: QuotationMarkResolutionSettings):
1414
self._settings: QuotationMarkResolutionSettings = settings
machine/corpora/quotation_denormalization_first_pass.py

Lines changed: 3 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -1,81 +1,8 @@
1-
from typing import Dict, List, Set
2-
3-
from .analysis.chapter import Chapter
4-
from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
5-
from .analysis.quotation_mark_finder import QuotationMarkFinder
6-
from .analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue
7-
from .analysis.quotation_mark_resolver import QuotationMarkResolver
8-
from .analysis.quotation_mark_string_match import QuotationMarkStringMatch
91
from .analysis.quote_convention import QuoteConvention
10-
from .analysis.quote_convention_set import QuoteConventionSet
11-
from .analysis.usfm_structure_extractor import UsfmStructureExtractor
12-
from .quotation_denormalization_action import QuotationDenormalizationAction
13-
from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings
2+
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
143

154

16-
class QuotationDenormalizationFirstPass(UsfmStructureExtractor):
5+
class QuotationDenormalizationFirstPass(QuotationMarkUpdateFirstPass):
176

187
def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention):
19-
super().__init__()
20-
self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder(
21-
QuoteConventionSet([source_quote_convention.normalize()])
22-
)
23-
self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver(
24-
QuotationDenormalizationResolutionSettings(source_quote_convention, target_quote_convention)
25-
)
26-
self._will_basic_denormalization_work: bool = self._check_whether_basic_denormalization_will_work(
27-
source_quote_convention, target_quote_convention
28-
)
29-
30-
def _check_whether_basic_denormalization_will_work(
31-
self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention
32-
) -> bool:
33-
normalized_source_quote_convention: QuoteConvention = source_quote_convention.normalize()
34-
target_marks_by_normalized_source_marks: Dict[str, Set[str]] = {}
35-
for level in range(1, normalized_source_quote_convention.get_num_levels() + 1):
36-
normalized_opening_quotation_mark = normalized_source_quote_convention.get_opening_quote_at_level(level)
37-
if normalized_opening_quotation_mark not in target_marks_by_normalized_source_marks:
38-
target_marks_by_normalized_source_marks[normalized_opening_quotation_mark] = set()
39-
if level <= target_quote_convention.get_num_levels():
40-
target_marks_by_normalized_source_marks[normalized_opening_quotation_mark].add(
41-
target_quote_convention.get_closing_quote_at_level(level)
42-
)
43-
44-
for normalized_source_mark in target_marks_by_normalized_source_marks:
45-
if len(target_marks_by_normalized_source_marks[normalized_source_mark]) > 1:
46-
return False
47-
return True
48-
49-
def get_best_actions_by_chapter(self) -> List[QuotationDenormalizationAction]:
50-
best_actions_by_chapter: List[QuotationDenormalizationAction] = []
51-
52-
for chapter in self.get_chapters():
53-
best_actions_by_chapter.append(self._find_best_action_for_chapter(chapter))
54-
55-
return best_actions_by_chapter
56-
57-
def _find_best_action_for_chapter(self, chapter: Chapter) -> QuotationDenormalizationAction:
58-
quotation_mark_matches: List[QuotationMarkStringMatch] = (
59-
self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter)
60-
)
61-
62-
self._quotation_mark_resolver.reset()
63-
64-
# use list() to force evaluation of the generator
65-
list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches))
66-
67-
return self._choose_best_action_based_on_observed_issues(self._quotation_mark_resolver.get_issues())
68-
69-
def _choose_best_action_based_on_observed_issues(self, issues) -> QuotationDenormalizationAction:
70-
if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues:
71-
return QuotationDenormalizationAction.SKIP
72-
73-
if (
74-
QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK in issues
75-
or QuotationMarkResolutionIssue.TOO_DEEP_NESTING in issues
76-
):
77-
if self._will_basic_denormalization_work:
78-
return QuotationDenormalizationAction.APPLY_BASIC
79-
return QuotationDenormalizationAction.SKIP
80-
81-
return QuotationDenormalizationAction.APPLY_FULL
8+
super().__init__(source_quote_convention.normalize(), target_quote_convention)
machine/corpora/quotation_denormalization_usfm_update_block_handler.py

Lines changed: 5 additions & 141 deletions
Original file line number | Diff line number | Diff line change
@@ -1,150 +1,14 @@
1-
from typing import List, Union
2-
3-
from .analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
4-
from .analysis.quotation_mark_finder import QuotationMarkFinder
5-
from .analysis.quotation_mark_resolver import QuotationMarkResolver
6-
from .analysis.quotation_mark_string_match import QuotationMarkStringMatch
71
from .analysis.quote_convention import QuoteConvention
8-
from .analysis.quote_convention_set import QuoteConventionSet
9-
from .analysis.text_segment import TextSegment
10-
from .analysis.usfm_marker_type import UsfmMarkerType
11-
from .basic_quotation_mark_resolver import BasicQuotationMarkResolver
12-
from .quotation_denormalization_action import QuotationDenormalizationAction
13-
from .quotation_denormalization_resolution_settings import QuotationDenormalizationResolutionSettings
14-
from .quotation_denormalization_settings import QuotationDenormalizationSettings
15-
from .usfm_token import UsfmToken, UsfmTokenType
16-
from .usfm_update_block import UsfmUpdateBlock
17-
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
18-
from .usfm_update_block_handler import UsfmUpdateBlockHandler
2+
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
3+
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
194

205

21-
class QuotationDenormalizationUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
6+
class QuotationDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler):
227

238
def __init__(
249
self,
2510
source_quote_convention: QuoteConvention,
2611
target_quote_convention: QuoteConvention,
27-
settings: QuotationDenormalizationSettings = QuotationDenormalizationSettings(),
12+
settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(),
2813
):
29-
super().__init__()
30-
self._source_quote_convention: QuoteConvention = source_quote_convention
31-
self._target_quote_convention: QuoteConvention = target_quote_convention
32-
self._settings: QuotationDenormalizationSettings = settings
33-
34-
self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder(
35-
QuoteConventionSet([self._source_quote_convention.normalize()])
36-
)
37-
self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder()
38-
39-
resolution_settings = QuotationDenormalizationResolutionSettings(
40-
self._source_quote_convention, self._target_quote_convention
41-
)
42-
43-
# Each embed represents a separate context for quotation marks
44-
# (i.e. you can't open a quote in one context and close it in another)
45-
# so we need to keep track of the verse and embed contexts separately.
46-
self._verse_text_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver(
47-
resolution_settings
48-
)
49-
self._embed_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver(
50-
resolution_settings
51-
)
52-
self._simple_quotation_mark_resolver: BasicQuotationMarkResolver = BasicQuotationMarkResolver(
53-
resolution_settings
54-
)
55-
self._current_denormalization_action = QuotationDenormalizationAction.APPLY_FULL
56-
self._current_chapter_number: int = 0
57-
self._current_verse_number: int = 0
58-
59-
def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
60-
self._check_for_chapter_change(block)
61-
self._check_for_verse_change(block)
62-
if self._current_denormalization_action is QuotationDenormalizationAction.SKIP:
63-
return block
64-
if self._current_denormalization_action is QuotationDenormalizationAction.APPLY_BASIC:
65-
return self._apply_simple_denormalization(block)
66-
return self._apply_full_denormalization(block)
67-
68-
def _apply_simple_denormalization(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
69-
for element in block._elements:
70-
self._process_scripture_element(element, self._simple_quotation_mark_resolver)
71-
return block
72-
73-
def _apply_full_denormalization(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
74-
for element in block._elements:
75-
if element.type == UsfmUpdateBlockElementType.EMBED:
76-
self._embed_quotation_mark_resolver.reset()
77-
self._process_scripture_element(element, self._embed_quotation_mark_resolver)
78-
else:
79-
self._process_scripture_element(element, self._verse_text_quotation_mark_resolver)
80-
81-
return block
82-
83-
def _process_scripture_element(
84-
self, element: UsfmUpdateBlockElement, quotation_mark_resolver: QuotationMarkResolver
85-
) -> None:
86-
text_segments: List[TextSegment] = self._create_text_segments(element)
87-
quotation_mark_matches: List[QuotationMarkStringMatch] = (
88-
self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments)
89-
)
90-
for resolved_quotation_mark in quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches):
91-
resolved_quotation_mark.update_quotation_mark(self._target_quote_convention)
92-
93-
def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSegment]:
94-
text_segments: List[TextSegment] = []
95-
for token in element.get_tokens():
96-
if token.type == UsfmTokenType.VERSE:
97-
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker)
98-
elif token.type == UsfmTokenType.PARAGRAPH:
99-
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ParagraphMarker)
100-
elif token.type == UsfmTokenType.CHARACTER:
101-
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CharacterMarker)
102-
elif token.type == UsfmTokenType.NOTE:
103-
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EmbedMarker)
104-
elif token.type == UsfmTokenType.TEXT:
105-
text_segment: Union[TextSegment, None] = self._create_text_segment(token)
106-
if text_segment is not None:
107-
text_segments.append(text_segment)
108-
return self._set_previous_and_next_for_segments(text_segments)
109-
110-
def _create_text_segment(self, token: UsfmToken) -> Union[TextSegment, None]:
111-
self._next_scripture_text_segment_builder.set_usfm_token(token)
112-
if token.text is not None:
113-
self._next_scripture_text_segment_builder.set_text(token.text)
114-
text_segment_to_return: TextSegment = self._next_scripture_text_segment_builder.build()
115-
self._next_scripture_text_segment_builder = TextSegment.Builder()
116-
return text_segment_to_return
117-
else:
118-
self._next_scripture_text_segment_builder = TextSegment.Builder()
119-
120-
def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]:
121-
for i in range(len(text_segments)):
122-
if i > 0:
123-
text_segments[i].set_previous_segment(text_segments[i - 1])
124-
if i < len(text_segments) - 1:
125-
text_segments[i].set_next_segment(text_segments[i + 1])
126-
return text_segments
127-
128-
def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None:
129-
for scripture_ref in block.refs:
130-
if scripture_ref.chapter_num != self._current_chapter_number:
131-
self._current_chapter_number = scripture_ref.chapter_num
132-
self._start_new_chapter(self._current_chapter_number)
133-
134-
def _start_new_chapter(self, new_chapter_number: int) -> None:
135-
self._current_denormalization_action = self._settings.get_action_for_chapter(new_chapter_number)
136-
self._verse_text_quotation_mark_resolver.reset()
137-
self._next_scripture_text_segment_builder = TextSegment.Builder()
138-
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.ChapterMarker)
139-
140-
def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None:
141-
for scripture_ref in block.refs:
142-
if (
143-
scripture_ref.chapter_num == self._current_chapter_number
144-
and scripture_ref.verse_num != self._current_verse_number
145-
):
146-
self._current_verse_number = scripture_ref.verse_num
147-
self._start_new_verse(self._current_verse_number)
148-
149-
def _start_new_verse(self, new_chapter_number: int) -> None:
150-
self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VerseMarker)
14+
super().__init__(source_quote_convention.normalize(), target_quote_convention, settings)

0 commit comments

Comments
 (0)