Fix SSML markup being added to non-SSML text

Note: this is perhaps not the best way to indicate the current text format.
mideind · Dec 6, 2024 · 13c84a0 · 13c84a0
1 parent df1991b
commit 13c84a0
Showing 1 changed file with 24 additions and 11 deletions.
diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py
@@ -48,7 +48,7 @@
     PunctuationTuple,
 )
 
-from icespeak.settings import TRACE
+from icespeak.settings import SETTINGS, TRACE, TextFormats
 
 from .num import (
     ROMAN_NUMERALS,
@@ -874,6 +874,7 @@ def spell(
         cls,
         txt: str,
         *,
+        text_format: TextFormats = SETTINGS.DEFAULT_TEXT_FORMAT,
         pause_length: str | None = None,
         literal: bool = False,
     ) -> str:
@@ -890,15 +891,18 @@ def spell(
             # Non-literal spelling
             f = lambda c: cls._CHAR_PRONUNCIATION.get(c.lower(), c) if not c.isspace() else ""
         t = tuple(map(f, txt))
-        return (
-            cls.vbreak(time="10ms")
-            + cls.vbreak(time=pause_length or "20ms").join(t)
-            + cls.vbreak(time="20ms" if len(t) > 1 else "10ms")
-        )
+        if text_format == "ssml":
+            return (
+                cls.vbreak(time="10ms")
+                + cls.vbreak(time=pause_length or "20ms").join(t)
+                + cls.vbreak(time="20ms" if len(t) > 1 else "10ms")
+            )
+        else:
+            return " ".join(t)
 
     @classmethod
     @_transcribe_method
-    def abbrev(cls, txt: str) -> str:
+    def abbrev(cls, txt: str, *, text_format: TextFormats = SETTINGS.DEFAULT_TEXT_FORMAT) -> str:
         """Expand an abbreviation."""
         meanings = tuple(
             filter(
@@ -908,7 +912,10 @@ def abbrev(cls, txt: str) -> str:
         )
         if meanings:
             # Abbreviation has at least one known meaning, expand it
-            return cls.vbreak(time="10ms") + meanings[0].stofn + cls.vbreak(time="50ms")
+            if text_format == "ssml":
+                return cls.vbreak(time="10ms") + meanings[0].stofn + cls.vbreak(time="50ms")
+            else:
+                return meanings[0].stofn
 
         # Fallbacks:
         # - Spell out, if any letter is uppercase (e.g. "MSc")
@@ -1111,7 +1118,9 @@ def entity(cls, txt: str) -> str:
     @_transcribe_method
     @_bool_args("full_text")
     @lru_cache(maxsize=50)  # Caching, as this method could be slow
-    def parser_transcribe(cls, txt: str, *, full_text: bool = False) -> str:
+    def parser_transcribe(
+        cls, txt: str, *, full_text: bool = False, text_format: TextFormats = SETTINGS.DEFAULT_TEXT_FORMAT
+    ) -> str:
         """
         Slow transcription of Icelandic text for TTS.
         Utilizes the parser from the GreynirPackage library.
@@ -1175,7 +1184,11 @@ def _percent(tok: Tok, term: SimpleTree | None) -> str:
 
         def _numwletter(tok: Tok, term: SimpleTree | None) -> str:
             num = "".join(filter(lambda c: c.isdecimal(), tok.txt))
-            return cls.number(num, case="nf", gender="hk") + " " + cls.spell(tok.txt[len(num) + 1 :])
+            return (
+                cls.number(num, case="nf", gender="hk")
+                + " "
+                + cls.spell(tok.txt[len(num) + 1 :], text_format=text_format)
+            )
 
         # Map certain terminals directly to transcription functions
         handler_map: Mapping[int, Callable[[Tok, SimpleTree | None], str]] = {
@@ -1235,7 +1248,7 @@ def _numwletter(tok: Tok, term: SimpleTree | None) -> str:
                     or any(not _ABBREV_RE.match(m.stofn) for m in tok.meanings)
                 ):
                     # Probably an abbreviation such as "t.d." or "MSc"
-                    s_parts.append(cls.abbrev(txt))
+                    s_parts.append(cls.abbrev(txt, text_format=text_format))
 
                 # Check whether this is a hyphen denoting a range
                 elif (