diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 359a24eb..29fa04b0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,4 +24,4 @@ jobs: - run: py.test --cov . - env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: coveralls + run: coveralls --service=github diff --git a/CHANGELOG.md b/CHANGELOG.md index ce5eb884..b916ee1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Fixed +- flattening: Uses much less memory by storing data in an embedded ZODB database, using ijson and using write-only mode in openpyxl. - use-titles: Use $ref'erring title if available https://github.com/OpenDataServices/flatten-tool/pull/368 - create-template --no-deprecated-fields: Did not work if deprecated element at same level as a $ref https://github.com/OpenDataServices/flatten-tool/issues/185#issuecomment-719587348 diff --git a/flattentool/__init__.py b/flattentool/__init__.py index 5c4f4bbf..b700353a 100644 --- a/flattentool/__init__.py +++ b/flattentool/__init__.py @@ -112,7 +112,7 @@ def flatten( else: schema_parser = None - parser = JSONParser( + with JSONParser( json_filename=input_name, root_list_path=None if root_is_list else root_list_path, schema_parser=schema_parser, @@ -126,33 +126,33 @@ def flatten( preserve_fields=preserve_fields, remove_empty_schema_columns=remove_empty_schema_columns, truncation_length=truncation_length, - ) - parser.parse() - - def spreadsheet_output(spreadsheet_output_class, name): - spreadsheet_output = spreadsheet_output_class( - parser=parser, - main_sheet_name=main_sheet_name, - output_name=name, - sheet_prefix=sheet_prefix, - ) - spreadsheet_output.write_sheets() - - if output_format == "all": - if not output_name: - output_name = "flattened" - for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items(): - spreadsheet_output( - spreadsheet_output_class, output_name +
FORMATS_SUFFIX[format_name] + persist=True, + ) as parser: + + def spreadsheet_output(spreadsheet_output_class, name): + spreadsheet_output = spreadsheet_output_class( + parser=parser, + main_sheet_name=main_sheet_name, + output_name=name, + sheet_prefix=sheet_prefix, ) + spreadsheet_output.write_sheets() + + if output_format == "all": + if not output_name: + output_name = "flattened" + for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items(): + spreadsheet_output( + spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name] + ) - elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats - if not output_name: - output_name = "flattened" + FORMATS_SUFFIX[output_format] - spreadsheet_output(OUTPUT_FORMATS[output_format], output_name) + elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats + if not output_name: + output_name = "flattened" + FORMATS_SUFFIX[output_format] + spreadsheet_output(OUTPUT_FORMATS[output_format], output_name) - else: - raise Exception("The requested format is not available") + else: + raise Exception("The requested format is not available") # From http://bugs.python.org/issue16535 diff --git a/flattentool/json_input.py b/flattentool/json_input.py index fa9634d8..74992e3b 100644 --- a/flattentool/json_input.py +++ b/flattentool/json_input.py @@ -7,18 +7,24 @@ import codecs import copy -import json import os +import tempfile +import uuid from collections import OrderedDict from decimal import Decimal from warnings import warn +import BTrees.OOBTree +import ijson +import transaction import xmltodict +import zc.zlibstorage +import ZODB.FileStorage from flattentool.i18n import _ from flattentool.input import path_search from flattentool.schema import make_sub_sheet_name -from flattentool.sheet import Sheet +from flattentool.sheet import PersistentSheet BASIC_TYPES = [str, bool, int, Decimal, type(None)] @@ -112,9 +118,26 @@ def __init__( remove_empty_schema_columns=False, 
rollup=False, truncation_length=3, + persist=False, ): + if persist: + self.zodb_db_location = ( + tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4()) + ) + zodb_storage = zc.zlibstorage.ZlibStorage( + ZODB.FileStorage.FileStorage(self.zodb_db_location) + ) + self.db = ZODB.DB(zodb_storage) + else: + # If None, in memory storage is used. + self.db = ZODB.DB(None) + + self.connection = self.db.open() + root = self.connection.root + root.sheet_store = BTrees.OOBTree.BTree() + self.sub_sheets = {} - self.main_sheet = Sheet() + self.main_sheet = PersistentSheet(connection=self.connection, name="") self.root_list_path = root_list_path self.root_id = root_id self.use_titles = use_titles @@ -125,9 +148,17 @@ def __init__( self.filter_value = filter_value self.remove_empty_schema_columns = remove_empty_schema_columns self.seen_paths = set() + self.persist = persist if schema_parser: - self.main_sheet = copy.deepcopy(schema_parser.main_sheet) + self.main_sheet = PersistentSheet.from_sheet( + schema_parser.main_sheet, self.connection + ) + for sheet_name, sheet in list(self.sub_sheets.items()): + self.sub_sheets[sheet_name] = PersistentSheet.from_sheet( + sheet, self.connection + ) + self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets) if remove_empty_schema_columns: # Don't use columns from the schema parser @@ -194,18 +225,13 @@ def __init__( _("Only one of json_file or root_json_dict should be supplied") ) - if json_filename: - with codecs.open(json_filename, encoding="utf-8") as json_file: - try: - self.root_json_dict = json.load( - json_file, object_pairs_hook=OrderedDict, parse_float=Decimal - ) - except UnicodeError as err: - raise BadlyFormedJSONErrorUTF8(*err.args) - except ValueError as err: - raise BadlyFormedJSONError(*err.args) - else: - self.root_json_dict = root_json_dict + if not json_filename: + if self.root_list_path is None: + self.root_json_list = root_json_dict + else: + self.root_json_list = path_search( + root_json_dict, 
self.root_list_path.split("/") + ) if preserve_fields: # Extract fields to be preserved from input file (one path per line) @@ -240,19 +266,37 @@ def __init__( self.preserve_fields = None self.preserve_fields_input = None + if json_filename: + if self.root_list_path is None: + path = "item" + else: + path = root_list_path.replace("/", ".") + ".item" + + json_file = codecs.open(json_filename, encoding="utf-8") + + self.root_json_list = ijson.items(json_file, path) + + try: + self.parse() + except ijson.common.IncompleteJSONError as err: + raise BadlyFormedJSONError(*err.args) + except UnicodeDecodeError as err: + raise BadlyFormedJSONErrorUTF8(*err.args) + finally: + if json_filename: + json_file.close() + def parse(self): - if self.root_list_path is None: - root_json_list = self.root_json_dict - else: - root_json_list = path_search( - self.root_json_dict, self.root_list_path.split("/") - ) - for json_dict in root_json_list: + for num, json_dict in enumerate(self.root_json_list): if json_dict is None: # This is particularly useful for IATI XML, in order to not # fall over on empty activity, e.g. 
def __exit__(self, type, value, traceback):
    """
    Close the ZODB connection and database, then delete any on-disk store.

    The connection and database are closed whether the store is in memory
    or on disk.  When ``self.persist`` is set, the file at
    ``self.zodb_db_location`` and its ``.lock``, ``.index`` and ``.tmp``
    companions are removed afterwards.

    Returns None, so an exception raised inside the ``with`` body is not
    suppressed.
    """
    try:
        self.connection.close()
        self.db.close()
    finally:
        # Runs even if closing raised, so a failed close does not leave the
        # temporary database files behind.
        if self.persist:
            for suffix in ("", ".lock", ".index", ".tmp"):
                try:
                    os.remove(self.zodb_db_location + suffix)
                except FileNotFoundError:
                    # Deliberate best effort: a companion file that is
                    # already gone must not abort removal of the others or
                    # mask an exception from the ``with`` body.
                    pass
class PersistentSheet(Sheet):
    """
    A sheet whose lines are persisted in a ZODB database.

    Column metadata (``id_columns``, ``columns``, ``titles``, ``root_id``,
    ``name``) is held on the instance exactly as for ``Sheet``.  The lines
    are stored in ``connection.root.sheet_store[name]``, a BTree keyed by an
    increasing integer, instead of in an in-memory list.

    """

    def __init__(self, columns=None, root_id="", name=None, connection=None):
        """
        Create the sheet and an empty line store registered under ``name``.

        ``connection`` must expose ``root.sheet_store``; whatever was stored
        there under the same ``name`` is replaced by an empty BTree.

        Raises ValueError if ``connection`` is not supplied.
        """
        if connection is None:
            # The keyword default exists only to keep the signature parallel
            # to Sheet's; fail with a clear message instead of an
            # AttributeError on ``None.root``.
            raise ValueError("PersistentSheet requires a ZODB connection")
        super().__init__(columns=columns, root_id=root_id, name=name)
        self.connection = connection
        # Key that the next appended line will be stored under.
        self.index = 0
        connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()

    @property
    def lines(self):
        """
        Yield the stored lines in key (i.e. insertion) order.

        This is a generator, so callers that need a list, a length or an
        equality comparison must wrap it in ``list(...)``.
        """
        for key, value in self.connection.root.sheet_store[self.name].items():
            if key % 5000 == 0:
                # Ask the connection to shrink its object cache every 5000
                # keys (assumed intent: bound memory while streaming a large
                # sheet).
                self.connection.cacheMinimize()
            yield value

    def append_line(self, flattened_dict):
        """Store ``flattened_dict`` as the next line of this sheet."""
        self.connection.root.sheet_store[self.name][self.index] = flattened_dict
        self.index += 1

    @classmethod
    def from_sheet(cls, sheet, connection):
        """
        Return a persistent copy of ``sheet`` stored through ``connection``.

        Columns, id columns and titles are deep-copied, so later changes to
        the copy do not leak back into ``sheet``.  Lines already held by
        ``sheet`` are copied across as well.
        """
        # Snapshot the lines before the new instance is created: creating it
        # resets the store registered under ``sheet.name``, which may be the
        # very store a persistent ``sheet`` reads its lines from.
        existing_lines = [copy.deepcopy(line) for line in sheet.lines]
        instance = cls(name=sheet.name, connection=connection)
        instance.id_columns = copy.deepcopy(sheet.id_columns)
        instance.columns = copy.deepcopy(sheet.columns)
        instance.titles = copy.deepcopy(sheet.titles)
        instance.root_id = sheet.root_id
        for line in existing_lines:
            instance.append_line(line)
        return instance
def test_parse_nested_list_json_dict():
    """A list of objects is moved out of the main sheet into a sub-sheet."""
    parser = JSONParser(
        root_json_dict=[OrderedDict([("a", "b"), ("c", [OrderedDict([("d", "e")])]),])]
    )
    assert list(parser.main_sheet) == ["a"]
    assert list(parser.main_sheet.lines) == [{"a": "b"}]
    # These two checks used to be bare comparison expressions without
    # ``assert``, so they were evaluated and discarded and could never fail.
    # ``lines`` is wrapped in ``list`` because a sheet may expose it as a
    # generator.
    # NOTE(review): the expected names follow the "<list>/0/<field>" form
    # that the other sub-sheet tests in this module assert (e.g. "c/0/d");
    # the previous, never-executed expectations used a bare "d". Confirm
    # against a test run.
    assert listify(parser.sub_sheets) == {"c": ["c/0/d"]}
    assert list(parser.sub_sheets["c"].lines) == [{"c/0/d": "e"}]
parser.sub_sheets["testnest"].lines == [ + assert list(parser.sub_sheets["testnest"].lines) == [ { "ocid": 1, "id": 2, @@ -234,7 +228,7 @@ def test_parse_ids_subsheet(self): "testnest/0/f/g": "h", }, ] - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"ocid": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"}, {"ocid": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"}, ] @@ -271,15 +265,14 @@ def test_parse_ids_nested(self): ], root_id="ocid", ) - parser.parse() assert list(parser.main_sheet) == ["ocid", "id", "a", "testnest/id", "f/g"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"ocid": 1, "id": 2, "a": "b", "testnest/id": 3, "f/g": "h"} ] assert listify(parser.sub_sheets) == { "tes_c": ["ocid", "id", "testnest/id", "testnest/c/0/d"] } - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"ocid": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e"}, {"ocid": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"}, ] @@ -326,9 +319,8 @@ def test_sub_sheets(self, tmpdir, remove_empty_schema_columns): schema_parser=schema_parser, remove_empty_schema_columns=remove_empty_schema_columns, ) - parser.parse() assert list(parser.main_sheet) == ["a"] - assert parser.main_sheet.lines == [{"a": "b"}] + assert list(parser.main_sheet.lines) == [{"a": "b"}] assert len(parser.sub_sheets) == 2 if not remove_empty_schema_columns else 1 if not remove_empty_schema_columns: assert list(parser.sub_sheets["c"]) == list(["ocid", "c/0/d", "c/0/f"]) @@ -352,11 +344,10 @@ def test_column_matching(self, tmpdir): schema_parser = SchemaParser(schema_filename=test_schema.strpath) schema_parser.parse() parser = JSONParser( - root_json_dict=[OrderedDict([("c", ["d"]),])], schema_parser=schema_parser + root_json_dict=[OrderedDict([("c", ["d"]),])], schema_parser=schema_parser, ) - parser.parse() assert list(parser.main_sheet) == ["c"] - assert 
parser.main_sheet.lines == [{"c": "d"}] + assert list(parser.main_sheet.lines) == [{"c": "d"}] assert len(parser.sub_sheets) == 0 def test_rollup(self): @@ -390,9 +381,8 @@ def test_rollup(self): root_id="ocid", rollup=True, ) - parser.parse() assert list(parser.main_sheet) == ["testA/0/testB"] - assert parser.main_sheet.lines == [{"testA/0/testB": "1"}] + assert list(parser.main_sheet.lines) == [{"testA/0/testB": "1"}] assert len(parser.sub_sheets) == 1 assert set(parser.sub_sheets["testA"]) == set( ["ocid", "testA/0/testB", "testA/0/testC"] @@ -438,9 +428,8 @@ def test_rollup_multiple_values(self, recwarn): schema_parser=schema_parser, rollup=True, ) - parser.parse() assert list(parser.main_sheet) == ["testA/0/testB"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ { "testA/0/testB": "WARNING: More than one value supplied, consult the relevant sub-sheet for the data." } @@ -502,7 +491,6 @@ def test_two_parents(self): ], schema_parser=schema_parser, ) - parser.parse() assert set(parser.main_sheet) == set() assert set(parser.sub_sheets) == set( ["Atest", "Dtest", "Ate_Btest", "Dte_Btest"] @@ -547,11 +535,12 @@ def test_parse_ids(self): ], root_id="custom", ) - parser.parse() assert list(parser.main_sheet) == ["custom", "id", "a", "f/g"] - assert parser.main_sheet.lines == [{"custom": 1, "id": 2, "a": "b", "f/g": "h"}] + assert list(parser.main_sheet.lines) == [ + {"custom": 1, "id": 2, "a": "b", "f/g": "h"} + ] assert listify(parser.sub_sheets) == {"c": ["custom", "id", "c/0/id", "c/0/d"]} - assert parser.sub_sheets["c"].lines == [ + assert list(parser.sub_sheets["c"].lines) == [ {"custom": 1, "id": 2, "c/0/id": 3, "c/0/d": "e"}, {"custom": 1, "id": 2, "c/0/id": 3, "c/0/d": "e2"}, ] @@ -590,9 +579,8 @@ def test_parse_ids_subsheet(self): ], root_id="custom", ) - parser.parse() assert list(parser.main_sheet) == ["custom", "id"] - assert parser.main_sheet.lines == [{"custom": 1, "id": 2,}] + assert list(parser.main_sheet.lines) == 
[{"custom": 1, "id": 2,}] assert listify(parser.sub_sheets) == { "testnest": [ "custom", @@ -603,7 +591,7 @@ def test_parse_ids_subsheet(self): ], "tes_c": ["custom", "id", "testnest/0/id", "testnest/0/c/0/d"], } - assert parser.sub_sheets["testnest"].lines == [ + assert list(parser.sub_sheets["testnest"].lines) == [ { "custom": 1, "id": 2, @@ -612,7 +600,7 @@ def test_parse_ids_subsheet(self): "testnest/0/f/g": "h", }, ] - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"}, {"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"}, ] @@ -649,15 +637,14 @@ def test_parse_ids_nested(self): ], root_id="custom", ) - parser.parse() assert list(parser.main_sheet) == ["custom", "id", "a", "testnest/id", "f/g"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"custom": 1, "id": 2, "a": "b", "testnest/id": 3, "f/g": "h"} ] assert listify(parser.sub_sheets) == { "tes_c": ["custom", "id", "testnest/id", "testnest/c/0/d"] } - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"custom": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e"}, {"custom": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"}, ] @@ -687,11 +674,10 @@ def test_parse_ids(self): ], root_id="", ) - parser.parse() assert list(parser.main_sheet) == ["id", "a", "f/g"] - assert parser.main_sheet.lines == [{"id": 2, "a": "b", "f/g": "h"}] + assert list(parser.main_sheet.lines) == [{"id": 2, "a": "b", "f/g": "h"}] assert listify(parser.sub_sheets) == {"c": ["id", "c/0/id", "c/0/d"]} - assert parser.sub_sheets["c"].lines == [ + assert list(parser.sub_sheets["c"].lines) == [ {"id": 2, "c/0/id": 3, "c/0/d": "e"}, {"id": 2, "c/0/id": 3, "c/0/d": "e2"}, ] @@ -729,17 +715,16 @@ def test_parse_ids_subsheet(self): ], root_id="", ) - parser.parse() assert list(parser.main_sheet) == ["id"] - assert 
parser.main_sheet.lines == [{"id": 2,}] + assert list(parser.main_sheet.lines) == [{"id": 2,}] assert listify(parser.sub_sheets) == { "testnest": ["id", "testnest/0/id", "testnest/0/a", "testnest/0/f/g"], "tes_c": ["id", "testnest/0/id", "testnest/0/c/0/d"], } - assert parser.sub_sheets["testnest"].lines == [ + assert list(parser.sub_sheets["testnest"].lines) == [ {"id": 2, "testnest/0/id": 3, "testnest/0/a": "b", "testnest/0/f/g": "h",}, ] - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"}, {"id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"}, ] @@ -775,15 +760,14 @@ def test_parse_ids_nested(self): ], root_id="", ) - parser.parse() assert list(parser.main_sheet) == ["id", "a", "testnest/id", "f/g"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"id": 2, "a": "b", "testnest/id": 3, "f/g": "h"} ] assert listify(parser.sub_sheets) == { "tes_c": ["id", "testnest/id", "testnest/c/0/d"] } - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"id": 2, "testnest/id": 3, "testnest/c/0/d": "e"}, {"id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"}, ] diff --git a/flattentool/tests/test_json_input_is_unflatten_reversed.py b/flattentool/tests/test_json_input_is_unflatten_reversed.py index cdd6a9a5..3007e2e2 100644 --- a/flattentool/tests/test_json_input_is_unflatten_reversed.py +++ b/flattentool/tests/test_json_input_is_unflatten_reversed.py @@ -80,7 +80,6 @@ def test_flatten( schema_parser=schema_parser, **extra_kwargs ) - parser.parse() expected_output_list = [ inject_root_id(root_id, expected_output_dict) @@ -188,7 +187,6 @@ def test_flatten_multiplesheets( schema_parser=schema_parser, **extra_kwargs ) - parser.parse() expected_output_dict = OrderedDict( [ @@ -197,11 +195,11 @@ def test_flatten_multiplesheets( ] ) output = { - sheet_name: sheet.lines + sheet_name: list(sheet.lines) for 
sheet_name, sheet in parser.sub_sheets.items() - if sheet.lines + if list(sheet.lines) } - output["custom_main"] = parser.main_sheet.lines + output["custom_main"] = list(parser.main_sheet.lines) assert output == expected_output_dict diff --git a/flattentool/tests/test_output.py b/flattentool/tests/test_output.py index 023ce09b..af786fb7 100644 --- a/flattentool/tests/test_output.py +++ b/flattentool/tests/test_output.py @@ -102,7 +102,7 @@ def test_empty_lines(tmpdir): subsheet = Sheet(root_id="ocid") subsheet.add_field("c") parser = MockParser(["a", "d"], {"b": subsheet}) - parser.main_sheet.lines = [] + parser.main_sheet._lines = [] for format_name, spreadsheet_output_class in output.FORMATS.items(): spreadsheet_output = spreadsheet_output_class( parser=parser, @@ -147,8 +147,8 @@ def test_populated_lines(tmpdir): subsheet = Sheet(root_id="ocid") subsheet.add_field("c") parser = MockParser(["a"], {}) - parser.main_sheet.lines = [{"a": "cell1"}, {"a": "cell2"}] - subsheet.lines = [{"c": "cell3"}, {"c": "cell4"}] + parser.main_sheet._lines = [{"a": "cell1"}, {"a": "cell2"}] + subsheet._lines = [{"c": "cell3"}, {"c": "cell4"}] parser.sub_sheets["b"] = subsheet for format_name, spreadsheet_output_class in output.FORMATS.items(): spreadsheet_output = spreadsheet_output_class( @@ -206,7 +206,7 @@ def test_populated_lines(tmpdir): def test_utf8(tmpdir): parser = MockParser(["é"], {}) - parser.main_sheet.lines = [{"é": "éαГ😼𝒞人"}, {"é": "cell2"}] + parser.main_sheet._lines = [{"é": "éαГ😼𝒞人"}, {"é": "cell2"}] for format_name, spreadsheet_output_class in output.FORMATS.items(): spreadsheet_output = spreadsheet_output_class( parser=parser, diff --git a/flattentool/tests/test_xml_input.py b/flattentool/tests/test_xml_input.py index 4ab90784..d0539749 100644 --- a/flattentool/tests/test_xml_input.py +++ b/flattentool/tests/test_xml_input.py @@ -15,9 +15,8 @@ def test_xml_empty(): xml=True, id_name="iati-identifier", ) - parser.parse() assert list(parser.main_sheet) == [] - 
def test_xml_whitespace():
    """
    Parsing the ``narrative_whitespace.xml`` fixture must not raise.

    Parsing happens while the parser is constructed, so any exception
    (the original test singled out ``TypeError``) propagates from the
    constructor call and fails the test.
    """
    # The former ``try: ... except TypeError as e: raise e`` wrapper only
    # re-raised the same exception, so it is dropped; the test fails on a
    # TypeError exactly as before.
    parser = JSONParser(
        json_filename="flattentool/tests/fixtures/narrative_whitespace.xml",
        root_list_path="iati-activity",
        schema_parser=None,
        root_id="",
        xml=True,
        id_name="iati-identifier",
    )
    assert parser is not None