Skip to content

Commit

Permalink
Fixes conversion of line breaks in code blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
steineggerroland committed Jan 13, 2025
1 parent 25085f7 commit 8309d65
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 5 deletions.
7 changes: 4 additions & 3 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,15 +354,16 @@ def test_formatting():
Here is a code sample:
`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n trafilatura.extract("")</pre></article></body></html>')
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n trafilatura.extract("")</pre></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
print(my_result)
assert my_result == """### Title
Here is a code sample:
```
import trafilatura
trafilatura.extract("")
import something
something.run("somewhere")
```
Sometimes code is wrapped using `pre` and `code`:
Expand Down
8 changes: 6 additions & 2 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool:


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"Determine element text based on just the text of the element. One must deal with the tail separately."
elem_text = element.text or ""
"Determine element text based on just the text of the element. One must deal with the tail separately."
# handle formatting: convert to markdown
if include_formatting and element.text:
if element.tag == "head":
Expand All @@ -268,7 +268,11 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if rend in HI_FORMATTING:
elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
elif element.tag == "code":
if "\n" in element.text:
if "\n" in elem_text or element.xpath(".//lb"): # Handle <br> inside <code>
# Convert <br> to \n within code blocks
for lb in element.xpath(".//lb"):
elem_text = f"{elem_text}\n{lb.tail}"
lb.getparent().remove(lb)
elem_text = f"```\n{elem_text}\n```\n"
else:
elem_text = f"`{elem_text}`"
Expand Down

0 comments on commit 8309d65

Please sign in to comment.