From 8309d65f4f89e99e5a35cb2153dc8bf70a6c8817 Mon Sep 17 00:00:00 2001 From: Roland Steinegger Date: Mon, 13 Jan 2025 15:21:28 +0100 Subject: [PATCH] Fixes conversion of line breaks in code blocks --- tests/unit_tests.py | 7 ++++--- trafilatura/xml.py | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index cfe94583..4ee0be2b 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -354,15 +354,16 @@ def test_formatting(): Here is a code sample: `import trafilatura`""" - my_document = html.fromstring('

Title

Here is a code sample:

import trafilatura\ntrafilatura.extract("")

Sometimes code is wrapped using pre and code:

import trafilatura\ntrafilatura.extract("")

Less often code is wrapped using just pre:

\n    trafilatura.extract("")
') + my_document = html.fromstring('

Title

Here is a code sample:

import something
something.run("somewhere")

Sometimes code is wrapped using pre and code:

import trafilatura\ntrafilatura.extract("")

Less often code is wrapped using just pre:

\n    trafilatura.extract("")
') my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG) + print(my_result) assert my_result == """### Title Here is a code sample: ``` -import trafilatura -trafilatura.extract("") +import something +something.run("somewhere") ``` Sometimes code is wrapped using `pre` and `code`: diff --git a/trafilatura/xml.py b/trafilatura/xml.py index fb3129de..a37fdcd4 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool: def replace_element_text(element: _Element, include_formatting: bool) -> str: - "Determine element text based on just the text of the element. One must deal with the tail separately." elem_text = element.text or "" + "Determine element text based on just the text of the element. One must deal with the tail separately." # handle formatting: convert to markdown if include_formatting and element.text: if element.tag == "head": @@ -268,7 +268,11 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: if rend in HI_FORMATTING: elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}" elif element.tag == "code": - if "\n" in element.text: + if "\n" in elem_text or element.xpath(".//lb"): # Handle
<br> inside <code> + # Convert <br>
to \n within code blocks + for lb in element.xpath(".//lb"): + elem_text = f"{elem_text}\n{lb.tail}" + lb.getparent().remove(lb) elem_text = f"```\n{elem_text}\n```\n" else: elem_text = f"`{elem_text}`"