Skip to content

Commit 0b82cf0

Browse files
committed
Preserve embeddings (fig, fm, notes, and cross references)
Only translate ft text. Configure preserving/stripping of embedded and style markers.
1 parent 3a9b17e commit 0b82cf0

10 files changed

+394
-135
lines changed

src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs

+10-2
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,9 @@ public string UpdateUsfm(
2323
string bookId,
2424
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
2525
string fullName = null,
26-
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
26+
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
27+
UpdateUsfmIntraVerseMarkerBehavior embeddedBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
28+
UpdateUsfmIntraVerseMarkerBehavior styleBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
2729
)
2830
{
2931
string fileName = _settings.GetBookFileName(bookId);
@@ -36,7 +38,13 @@ public string UpdateUsfm(
3638
usfm = reader.ReadToEnd();
3739
}
3840

39-
var handler = new UpdateUsfmParserHandler(rows, fullName is null ? null : $"- {fullName}", behavior);
41+
var handler = new UpdateUsfmParserHandler(
42+
rows,
43+
fullName is null ? null : $"- {fullName}",
44+
textBehavior,
45+
embeddedBehavior,
46+
styleBehavior
47+
);
4048
try
4149
{
4250
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);

src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs

+102-21
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
using System.Collections.Generic;
22
using System.Linq;
3+
using SIL.Extensions;
34
using SIL.Scripture;
45

56
namespace SIL.Machine.Corpora
@@ -9,7 +10,8 @@ public enum ScriptureTextType
910
None,
1011
NonVerse,
1112
Verse,
12-
Note
13+
Embedded,
14+
NoteText
1315
}
1416

1517
public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
@@ -19,6 +21,9 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
1921
private readonly Stack<ScriptureTextType> _curTextType;
2022
private bool _duplicateVerse = false;
2123

24+
private bool _inEmbedded;
25+
public bool InNoteText { get; private set; }
26+
2227
protected ScriptureRefUsfmParserHandlerBase()
2328
{
2429
_curElements = new Stack<ScriptureElement>();
@@ -59,7 +64,7 @@ string pubNumber
5964
// ignore duplicate verses
6065
_duplicateVerse = true;
6166
}
62-
else if (VerseRef.AreOverlappingVersesRanges(number, _curVerseRef.Verse))
67+
else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse))
6368
{
6469
// merge overlapping verse ranges into one range
6570
VerseRef verseRef = _curVerseRef.Clone();
@@ -153,20 +158,36 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close
153158

154159
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
155160
{
156-
if (CurrentTextType != ScriptureTextType.None && !_duplicateVerse)
161+
_inEmbedded = true;
162+
StartEmbedded(state, marker, caller, category);
163+
}
164+
165+
public override void EndNote(UsfmParserState state, string marker, bool closed)
166+
{
167+
EndNoteText(state);
168+
EndEmbedded(state, marker, null, closed);
169+
_inEmbedded = false;
170+
}
171+
172+
public virtual void StartEmbedded(UsfmParserState state, string marker, string caller, string category)
173+
{
174+
if (_curVerseRef.IsDefault)
175+
UpdateVerseRef(state.VerseRef, marker);
176+
177+
if (!_duplicateVerse)
157178
{
158179
// if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment
159180
CheckConvertVerseParaToNonVerse(state);
160181
NextElement(marker);
161-
StartNoteText(state);
162182
}
163183
}
164184

165-
public override void EndNote(UsfmParserState state, string marker, bool closed)
166-
{
167-
if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse)
168-
EndNoteText(state);
169-
}
185+
public virtual void EndEmbedded(
186+
UsfmParserState state,
187+
string marker,
188+
IReadOnlyList<UsfmAttribute> attributes,
189+
bool closed
190+
) { }
170191

171192
public override void Text(UsfmParserState state, string text)
172193
{
@@ -187,9 +208,37 @@ public override void StartChar(
187208
IReadOnlyList<UsfmAttribute> attributes
188209
)
189210
{
211+
if (IsEmbeddedPart(markerWithoutPlus))
212+
EndNoteText(state);
213+
190214
// if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse
191215
// segment
192216
CheckConvertVerseParaToNonVerse(state);
217+
218+
if (IsEmbeddedCharacter(markerWithoutPlus))
219+
{
220+
_inEmbedded = true;
221+
StartEmbedded(state, markerWithoutPlus, null, null);
222+
}
223+
224+
if (IsNoteText(markerWithoutPlus))
225+
{
226+
StartNoteText(state);
227+
}
228+
}
229+
230+
public override void EndChar(
231+
UsfmParserState state,
232+
string marker,
233+
IReadOnlyList<UsfmAttribute> attributes,
234+
bool closed
235+
)
236+
{
237+
if (IsEmbeddedCharacter(marker))
238+
{
239+
EndEmbedded(state, marker, attributes, closed);
240+
_inEmbedded = false;
241+
}
193242
}
194243

195244
protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }
@@ -200,8 +249,25 @@ protected virtual void StartNonVerseText(UsfmParserState state, ScriptureRef scr
200249

201250
protected virtual void EndNonVerseText(UsfmParserState state, ScriptureRef scriptureRef) { }
202251

252+
public virtual void StartNoteText(UsfmParserState state)
253+
{
254+
InNoteText = true;
255+
_curTextType.Push(ScriptureTextType.NoteText);
256+
StartNoteText(state, CreateNonVerseRef());
257+
}
258+
203259
protected virtual void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) { }
204260

261+
public virtual void EndNoteText(UsfmParserState state)
262+
{
263+
if (_curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.NoteText)
264+
{
265+
EndNoteText(state, CreateNonVerseRef());
266+
_curTextType.Pop();
267+
InNoteText = false;
268+
}
269+
}
270+
205271
protected virtual void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) { }
206272

207273
private void StartVerseText(UsfmParserState state)
@@ -227,22 +293,11 @@ private void StartNonVerseText(UsfmParserState state)
227293

228294
private void EndNonVerseText(UsfmParserState state)
229295
{
296+
EndEmbeddedElements();
230297
EndNonVerseText(state, CreateNonVerseRef());
231298
_curTextType.Pop();
232299
}
233300

234-
private void StartNoteText(UsfmParserState state)
235-
{
236-
_curTextType.Push(ScriptureTextType.Note);
237-
StartNoteText(state, CreateNonVerseRef());
238-
}
239-
240-
private void EndNoteText(UsfmParserState state)
241-
{
242-
EndNoteText(state, CreateNonVerseRef());
243-
_curTextType.Pop();
244-
}
245-
246301
private void UpdateVerseRef(VerseRef verseRef, string marker)
247302
{
248303
if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
@@ -270,6 +325,12 @@ private void EndParentElement()
270325
_curElements.Pop();
271326
}
272327

328+
private void EndEmbeddedElements()
329+
{
330+
if (_curElements.Count > 0 && IsEmbeddedCharacter(_curElements.Peek().Name))
331+
_curElements.Pop();
332+
}
333+
273334
private IReadOnlyList<ScriptureRef> CreateVerseRefs()
274335
{
275336
return _curVerseRef.HasMultiple
@@ -300,5 +361,25 @@ private void CheckConvertVerseParaToNonVerse(UsfmParserState state)
300361
StartNonVerseText(state);
301362
}
302363
}
364+
365+
public bool InEmbedded(string marker)
366+
{
367+
return _inEmbedded || IsEmbeddedCharacter(marker);
368+
}
369+
370+
private static bool IsNoteText(string marker)
371+
{
372+
return marker == "ft";
373+
}
374+
375+
public static bool IsEmbeddedPart(string marker)
376+
{
377+
return !(marker is null) && marker.Length > 0 && marker[0].IsOneOf('f', 'x', 'z');
378+
}
379+
380+
private static bool IsEmbeddedCharacter(string marker)
381+
{
382+
return marker.IsOneOf("f", "fe", "fig", "fm", "x");
383+
}
303384
}
304385
}

0 commit comments

Comments
 (0)