Skip to content

Fix freeze when reading corrupt XLSX files #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CONTRIBUTORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
In alphabetical order:

* `Mark Skelton <https://github.com/mtskelton>`_
* `Pierre-Louis Peeters <https://github.com/PLPeeters>`_
7 changes: 6 additions & 1 deletion changelog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ name: pyexcel-xlsxr
organisation: pyexcel
releases:
- changes:
- action: Updated
- action: Updated
details:
- 'Fix freeze when parsing certain corrupt XLSX files'
date: 26.06.2025
version: 0.6.2
- action: Updated
details:
- '#9: Potential fix for incorrect reading of data with empty cells when used with pyexcel '
date: 11.11.2024
Expand Down
11 changes: 6 additions & 5 deletions pyexcel_xlsxr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""
pyexcel_xlsxr
~~~~~~~~~~~~~~~~~~~
The lower level xlsx file format handler using lxml
:copyright: (c) 2015-2020 by Onni Software Ltd & its contributors
:license: New BSD License
pyexcel_xlsxr
~~~~~~~~~~~~~~~~~~~
The lower level xlsx file format handler using lxml
:copyright: (c) 2015-2020 by Onni Software Ltd & its contributors
:license: New BSD License
"""

from pyexcel_io.io import get_data as read_data
from pyexcel_io.io import isstream
from pyexcel_io.plugins import IOPluginInfoChainV2
Expand Down
4 changes: 2 additions & 2 deletions pyexcel_xlsxr/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = '0.6.1'
__author__ = 'C.W.'
__version__ = "0.6.1"
__author__ = "C.W."
30 changes: 13 additions & 17 deletions pyexcel_xlsxr/messy_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,12 @@
STYLE_FILENAME = "xl/styles.xml"
SHARED_STRING = "xl/sharedStrings.xml"
WORK_BOOK = "xl/workbook.xml"
SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml"
SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml"
XLSX_ROW_MATCH = re.compile(rb".*?(<row.*?<\/.*?row>).*?", re.MULTILINE)
NUMBER_FMT_MATCHER = re.compile(
rb".*?(<numFmts.*?<\/.*?numFmts>).*?", re.MULTILINE
)
XFS_FMT_MATCHER = re.compile(
rb".*?(<cellXfs.*?<\/.*?cellXfs>).*?", re.MULTILINE
)
SHEET_FMT_MATCHER = re.compile(rb".*?(<sheet .*?\/>).*?", re.MULTILINE)
DATE_1904_MATCHER = re.compile(rb".*?(<workbookPr.*?\/>).*?", re.MULTILINE)
SHEET_MATCHER = re.compile(r"xl/worksheets/(?:work)?sheet([0-9]+)?.xml")
XLSX_ROW_MATCH = re.compile(rb"<row\b[^>]*>.*?</row>", re.DOTALL)
NUMBER_FMT_MATCHER = re.compile(rb"<numFmts\b[^>]*>.*?</numFmts>", re.DOTALL)
XFS_FMT_MATCHER = re.compile(rb"<cellXfs\b[^>]*>.*?</cellXfs>", re.DOTALL)
SHEET_FMT_MATCHER = re.compile(rb"<sheet\b.*?/>", re.DOTALL)
DATE_1904_MATCHER = re.compile(rb"<workbookPr\b.*?/>", re.DOTALL)
# xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"
# But it is not used for now
X14AC_NAMESPACE = b'xmlns:x14ac="http://not.used.com/"'
Expand Down Expand Up @@ -158,14 +153,15 @@ def find_sheets(file_list):
return [
sheet_file
for sheet_file in file_list
if re.match(SHEET_MATCHER, sheet_file)
if SHEET_MATCHER.match(sheet_file)
]


def get_sheet_index(file_name):
if re.match(SHEET_MATCHER, file_name):
result = re.search(SHEET_INDEX_MATCHER, file_name)
index = int(result.group(3)) if result.group(3) else 1
sheet_match = SHEET_MATCHER.match(file_name)

if sheet_match:
index = int(sheet_match.group(1)) if sheet_match.group(1) else 1
return index - 1
else:
raise Exception("Invalid sheet file name")
Expand Down Expand Up @@ -338,10 +334,10 @@ def parse_book_properties(book_content):
)
namespaces = {"r": ns}

xlsx_header = u"<wrapper {0}>".format(
xlsx_header = "<wrapper {0}>".format(
" ".join('xmlns:{0}="{1}"'.format(k, v) for k, v in namespaces.items())
).encode("utf-8")
xlsx_footer = u"</wrapper>".encode("utf-8")
xlsx_footer = "</wrapper>".encode("utf-8")
sheets = SHEET_FMT_MATCHER.findall(book_content)
for sheet in sheets:
block = xlsx_header + sheet + xlsx_footer
Expand Down
4 changes: 4 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
black~=25.1
coverage~=7.9
pytest~=8.4
pyexcel~=0.7.3
2 changes: 1 addition & 1 deletion test.bat
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pip freeze
nosetests --with-coverage --cover-package pyexcel_xlsxr --cover-package tests tests --with-doctest --doctest-extension=.rst README.rst pyexcel_xlsxr
coverage run -m --source=pyexcel_xlsxr pytest && coverage report --show-missing
2 changes: 1 addition & 1 deletion test.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#/bin/bash
pip freeze
nosetests --with-coverage --cover-package pyexcel_xlsxr --cover-package tests tests --with-doctest --doctest-extension=.rst README.rst pyexcel_xlsxr
coverage run -m --source=pyexcel_xlsxr pytest && coverage report --show-missing
221 changes: 108 additions & 113 deletions tests/test_bug_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,123 +4,118 @@
from pyexcel_xlsxr import get_data
from pyexcel_io.reader import EncapsulatedSheetReader

from nose.tools import eq_


def test_issue_1():
test_file = get_fixture("issue_1.xlsx")
data = get_data(test_file)
data_array = [list(map(str, row)) for row in data["dataSheet1"]]
eq_(
data_array,
[
["", "D0"],
["Pads", "PADA"],
["Timestamp", "13:26:26.375087"],
["I", "V"],
["0.0", "0.7830809999999999"],
["1.0", "1.11145"],
["2.0", "1.176147"],
["3.0", "1.222229"],
["4.0", "1.25946"],
["5.0", "1.293334"],
["6.0", "1.323852"],
["7.0", "1.351623"],
["8.0", "1.3778679999999999"],
["9.0", "1.402893"],
["10.0", "1.427001"],
["11.0", "1.449279"],
["12.0", "1.471252"],
["13.0", "1.4923089999999999"],
["14.0", "1.512451"],
["15.0", "1.531982"],
["16.0", "1.551513"],
["17.0", "1.5701289999999999"],
["18.0", "1.588134"],
["19.0", "1.606445"],
["20.0", "1.623535"],
["21.0", "1.64093"],
["22.0", "1.657714"],
["23.0", "1.674804"],
["24.0", "1.6906729999999999"],
["25.0", "1.707153"],
["26.0", "1.7233269999999998"],
["27.0", "1.738586"],
["28.0", "1.7544549999999999"],
["29.0", "1.769104"],
["30.0", "1.784667"],
["31.0", "1.799316"],
["32.0", "1.8148799999999998"],
["33.0", "1.8286129999999998"],
["34.0", "1.8432609999999998"],
["35.0", "1.85791"],
["36.0", "1.871948"],
["37.0", "1.885986"],
["38.0", "1.900329"],
["39.0", "1.913452"],
["40.0", "1.92749"],
["41.0", "1.941223"],
["42.0", "1.954345"],
["43.0", "1.967773"],
["44.0", "1.9808949999999999"],
["45.0", "1.9940179999999998"],
["46.0", "2.007446"],
["47.0", "2.019958"],
["48.0", "2.03247"],
["49.0", "2.0455929999999998"],
["50.0", "2.05841"],
["51.0", "2.071228"],
["52.0", "2.083129"],
["53.0", "2.095336"],
["54.0", "2.1072379999999997"],
["55.0", "2.120056"],
["56.0", "2.131652"],
["57.0", "2.143859"],
["58.0", "2.156066"],
["59.0", "2.167663"],
["60.0", "2.1795649999999998"],
["61.0", "2.191162"],
["62.0", "2.2021479999999998"],
["63.0", "2.214355"],
["64.0", "2.225646"],
["65.0", "2.236633"],
["66.0", "2.247009"],
["67.0", "2.258911"],
["68.0", "2.269897"],
["69.0", "2.2808829999999998"],
["70.0", "2.2915639999999997"],
["71.0", "2.302246"],
["72.0", "2.3138419999999997"],
["73.0", "2.3245229999999997"],
["74.0", "2.334899"],
["75.0", "2.3455809999999997"],
["76.0", "2.356262"],
["77.0", "2.366333"],
["78.0", "2.376708"],
["79.0", "2.3864739999999998"],
["80.0", "2.3971549999999997"],
["81.0", "2.407531"],
["82.0", "2.417602"],
["83.0", "2.427673"],
["84.0", "2.438354"],
["85.0", "2.4472039999999997"],
["86.0", "2.457885"],
["87.0", "2.467956"],
["88.0", "2.477722"],
["89.0", "2.487487"],
["90.0", "2.4978629999999997"],
["91.0", "2.506408"],
["92.0", "2.515869"],
["93.0", "2.5256339999999997"],
["94.0", "2.535095"],
["95.0", "2.54425"],
["96.0", "2.5537099999999997"],
["97.0", "2.562866"],
["98.0", "2.572021"],
["99.0", "2.5805659999999997"],
["100.0", "2.589721"],
],
)
assert data_array == [
["", "D0"],
["Pads", "PADA"],
["Timestamp", "13:26:26.375087"],
["I", "V"],
["0.0", "0.7830809999999999"],
["1.0", "1.11145"],
["2.0", "1.176147"],
["3.0", "1.222229"],
["4.0", "1.25946"],
["5.0", "1.293334"],
["6.0", "1.323852"],
["7.0", "1.351623"],
["8.0", "1.3778679999999999"],
["9.0", "1.402893"],
["10.0", "1.427001"],
["11.0", "1.449279"],
["12.0", "1.471252"],
["13.0", "1.4923089999999999"],
["14.0", "1.512451"],
["15.0", "1.531982"],
["16.0", "1.551513"],
["17.0", "1.5701289999999999"],
["18.0", "1.588134"],
["19.0", "1.606445"],
["20.0", "1.623535"],
["21.0", "1.64093"],
["22.0", "1.657714"],
["23.0", "1.674804"],
["24.0", "1.6906729999999999"],
["25.0", "1.707153"],
["26.0", "1.7233269999999998"],
["27.0", "1.738586"],
["28.0", "1.7544549999999999"],
["29.0", "1.769104"],
["30.0", "1.784667"],
["31.0", "1.799316"],
["32.0", "1.8148799999999998"],
["33.0", "1.8286129999999998"],
["34.0", "1.8432609999999998"],
["35.0", "1.85791"],
["36.0", "1.871948"],
["37.0", "1.885986"],
["38.0", "1.900329"],
["39.0", "1.913452"],
["40.0", "1.92749"],
["41.0", "1.941223"],
["42.0", "1.954345"],
["43.0", "1.967773"],
["44.0", "1.9808949999999999"],
["45.0", "1.9940179999999998"],
["46.0", "2.007446"],
["47.0", "2.019958"],
["48.0", "2.03247"],
["49.0", "2.0455929999999998"],
["50.0", "2.05841"],
["51.0", "2.071228"],
["52.0", "2.083129"],
["53.0", "2.095336"],
["54.0", "2.1072379999999997"],
["55.0", "2.120056"],
["56.0", "2.131652"],
["57.0", "2.143859"],
["58.0", "2.156066"],
["59.0", "2.167663"],
["60.0", "2.1795649999999998"],
["61.0", "2.191162"],
["62.0", "2.2021479999999998"],
["63.0", "2.214355"],
["64.0", "2.225646"],
["65.0", "2.236633"],
["66.0", "2.247009"],
["67.0", "2.258911"],
["68.0", "2.269897"],
["69.0", "2.2808829999999998"],
["70.0", "2.2915639999999997"],
["71.0", "2.302246"],
["72.0", "2.3138419999999997"],
["73.0", "2.3245229999999997"],
["74.0", "2.334899"],
["75.0", "2.3455809999999997"],
["76.0", "2.356262"],
["77.0", "2.366333"],
["78.0", "2.376708"],
["79.0", "2.3864739999999998"],
["80.0", "2.3971549999999997"],
["81.0", "2.407531"],
["82.0", "2.417602"],
["83.0", "2.427673"],
["84.0", "2.438354"],
["85.0", "2.4472039999999997"],
["86.0", "2.457885"],
["87.0", "2.467956"],
["88.0", "2.477722"],
["89.0", "2.487487"],
["90.0", "2.4978629999999997"],
["91.0", "2.506408"],
["92.0", "2.515869"],
["93.0", "2.5256339999999997"],
["94.0", "2.535095"],
["95.0", "2.54425"],
["96.0", "2.5537099999999997"],
["97.0", "2.562866"],
["98.0", "2.572021"],
["99.0", "2.5805659999999997"],
["100.0", "2.589721"],
]


def test_issue_5():
Expand All @@ -131,7 +126,7 @@ def test_issue_5():

sheet = EncapsulatedSheetReader(XLSXSheet(native_sheet))
data = sheet.to_array()
eq_(list(data), [[None, 11, 11]])
assert list(data) == [[None, 11, 11]]


def get_fixture(file_name):
Expand Down
Loading