Skip to content

Commit

Permalink
1. Refine img src URL handling: support both absolute and relative paths, resolving relative ones against the page URL when it is known
Browse files Browse the repository at this point in the history
2. Fix bugs in table extraction: (a) the last row containing a colspan cell was not formatted correctly in markdown/txt output; (b) strip stray newlines from table cell text in markdown output in special cases
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Dec 5, 2024
1 parent 76200b7 commit 2d39b3e
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 15 deletions.
39 changes: 39 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,13 @@ def test_images():
assert extract('<html><body><article><p><img other="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == ''
assert extract('<html><body><article><div><p><img data-src="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img data-src-small="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img src="https://a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](https://a.b/test.jpg)'

url = 'http://a.b/c/d.html'
assert extract('<html><body><article><div><p><img src="//a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="/a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="./a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/c/a.b/test.jpg)'
assert extract('<html><body><article><div><p><img src="../a.b/test.jpg" alt="text" title="a title"/></p></div></article></body></html>', url=url, include_images=True, fast=True) == '![a title text](http://a.b/a.b/test.jpg)'

assert handle_image(html.fromstring('<img src="data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="text"></img>')) is None

Expand Down Expand Up @@ -1187,6 +1194,38 @@ def test_table_processing():
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == ""

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
<tr><td>a</td><td colspan="2">
<p>b</p>
<p>c</p>
</td></tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"


def test_list_processing():
options = DEFAULT_OPTIONS
Expand Down
28 changes: 20 additions & 8 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from lxml.etree import _Element, Element, SubElement, strip_elements, strip_tags, tostring
from lxml.html import HtmlElement

from urllib.parse import urljoin

# own
from .htmlprocessing import (delete_by_link_density, handle_textnode,
link_density_test_tables, process_node,
Expand Down Expand Up @@ -333,7 +335,7 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
newsub.text, newsub.tail = processed_child.text, processed_child.tail

if processed_child.tag == 'graphic':
image_elem = handle_image(processed_child)
image_elem = handle_image(processed_child, options)
if image_elem is not None:
newsub = image_elem
processed_element.append(newsub)
Expand Down Expand Up @@ -367,10 +369,16 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# strip these structural elements
strip_tags(table_elem, "thead", "tbody", "tfoot")

# calculate maximum number of columns per row, includin colspan
# calculate maximum number of columns per row, including colspan
max_cols = 0
diff_colspans = set()
for tr in table_elem.iter('tr'):
max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))
total_colspans = 0
for td in tr.iter(TABLE_ELEMS):
colspan = int(td.get("colspan", 1))
diff_colspans.add(colspan)
total_colspans += colspan
max_cols = max(max_cols, total_colspans)

# explore sub-elements
seen_header_row = False
Expand Down Expand Up @@ -431,8 +439,9 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# cleanup
subelement.tag = "done"

# clean up row attributes
newrow.attrib.pop("span", None)
# clean up row attributes only when all cells in table share the same colspan
if len(diff_colspans) == 1:
newrow.attrib.pop("span", None)

# end of processing
if len(newrow) > 0:
Expand All @@ -442,7 +451,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
return None


def handle_image(element: Optional[_Element]) -> Optional[_Element]:
def handle_image(element: Optional[_Element], options: Optional[Extractor] = None) -> Optional[_Element]:
"Process image elements and their relevant attributes."
if element is None:
return None
Expand Down Expand Up @@ -474,7 +483,10 @@ def handle_image(element: Optional[_Element]) -> Optional[_Element]:
# post-processing: URLs
src_attr = processed_element.get("src", "")
if not src_attr.startswith("http"):
processed_element.set("src", re.sub(r"^//", "http://", src_attr))
if options is not None and options.url is not None:
processed_element.set("src", urljoin(options.url, src_attr))
else:
processed_element.set("src", re.sub(r"^//", "http://", src_attr))

return processed_element

Expand Down Expand Up @@ -502,7 +514,7 @@ def handle_textelem(element: _Element, potential_tags: Set[str], options: Extrac
elif element.tag == 'table' and 'table' in potential_tags:
new_element = handle_table(element, potential_tags, options)
elif element.tag == 'graphic' and 'graphic' in potential_tags:
new_element = handle_image(element)
new_element = handle_image(element, options)
else:
# other elements (div, ??, ??)
new_element = handle_other_elements(element, potential_tags, options)
Expand Down
17 changes: 10 additions & 7 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,12 +285,15 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell" and elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif element.tag == 'cell' and elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
if element.tag == "cell":
elem_text = elem_text.strip()

if elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
Expand Down Expand Up @@ -348,7 +351,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
if element.tail:
returnlist.append(element.tail)
returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
Expand Down

0 comments on commit 2d39b3e

Please sign in to comment.