Skip to content

Commit

Permalink
implement tablify transformer
Browse files Browse the repository at this point in the history
  • Loading branch information
gschnabel committed Sep 5, 2022
1 parent 5866b50 commit 6079426
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 1 deletion.
1 change: 1 addition & 0 deletions exfor_parserpy/trafos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .uncommonfy import uncommonfy
from .depointerfy import depointerfy
from .detextify import detextify
from .tablify import tablify
147 changes: 147 additions & 0 deletions exfor_parserpy/trafos/tablify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
############################################################
#
# Author(s): Georg Schnabel
# Email: [email protected]
# Creation date: 2022/09/04
# Last modified: 2022/09/05
# License: MIT
# Copyright (c) 2022 International Atomic Energy Agency (IAEA)
#
############################################################
import pandas as pd
import numpy as np
import re
from ..utils.convenience import is_dic, contains_pointers, is_subentry
from ..utils.custom_iterators import exfor_iterator3


def tablify(exfor_dic, sep=".", pointersep="#", keep_toplevel=False):
    """Convert the subentries of an EXFOR dictionary into one flat table.

    Every subentry located by ``exfor_iterator3`` is flattened into its own
    dataframe: ``column_iterator`` yields (column path, content) pairs and
    ``column_transformer`` renames or discards them.  Afterwards the columns
    of each first subentry (SUBENTRY id with ``001`` at character positions
    5-7) are repeated and attached to the rows of the other subentries of
    the same ENTRY; the rows of the first subentry itself are not part of
    the result.

    Parameters
    ----------
    exfor_dic : dict
        Nested dictionary that is searched for subentries.
    sep : str
        String used to join nested keys into a column path.
    pointersep : str
        String placed between a column path and a pointer key.
    keep_toplevel : bool
        Forwarded to ``column_transformer``; when true the ``BIB``,
        ``COMMON`` or ``DATA`` prefix is kept in the column names.

    Returns
    -------
    pandas.DataFrame
        One table for all non-first subentries with the ``ENTRY`` and
        ``SUBENTRY`` columns moved to the front.

    Notes
    -----
    If no subentry other than first subentries is present, the final
    ``pd.concat`` receives an empty list and raises ``ValueError``.
    """
    # Step 1: traverse the nested dictionary, locate all subentries and
    # turn the (column name, content) pairs of each one into a dataframe.
    subent_frames = []
    subent_iter = exfor_iterator3(exfor_dic, filterfun=is_subentry)
    for subentid, subent, _parent in subent_iter:
        if not is_subentry(subent, subentid):
            continue
        cur_table_dic = {}
        for col, cont in column_iterator(subent, sep=sep, pointersep=pointersep):
            tcol, tcont = column_transformer(
                col, cont, sep, pointersep, keep_toplevel
            )
            # a column name of None marks a column that must be dropped
            if tcol is not None:
                cur_table_dic[tcol] = tcont
        try:
            curdf = pd.DataFrame.from_dict(cur_table_dic)
        except ValueError:
            # from_dict fails if all fields are scalars; wrapping the
            # dictionary in a list yields a single-row dataframe instead
            curdf = pd.DataFrame([cur_table_dic])
        subent_frames.append(curdf)

    df = pd.concat(subent_frames, ignore_index=True)

    # Step 2: merge the information of the first subentry into all other
    # subentries of the same entry and leave out the first subentry rows.
    # A boolean mask is used so that no helper column is written into df.
    is_first = df["SUBENTRY"].str.slice(5, 8) == "001"
    index_cols = ["ENTRY", "SUBENTRY"]
    first_subent_df = df[is_first].set_index(index_cols)
    other_subent_df = df[~is_first].set_index(index_cols)

    merged_frames = []
    # grouping by the plain label (not a one-element list) gives a scalar
    # group key regardless of the pandas version
    for entry_id, group_df in other_subent_df.groupby("ENTRY"):
        curdf = group_df.dropna(axis=1).reset_index()
        try:
            firstsub = first_subent_df.loc[entry_id]
        except KeyError:
            # no corresponding first subentry: keep the rows unchanged
            merged_frames.append(curdf)
            continue
        firstsub = firstsub.dropna(axis=1)
        # np.repeat expects an integer repeat count, hence floor division
        nrep = len(curdf) // len(firstsub)
        firstsub = pd.DataFrame(
            np.repeat(firstsub.values, nrep, axis=0), columns=firstsub.columns
        )
        merged_frames.append(pd.concat([firstsub, curdf], axis=1))

    ret_df = pd.concat(merged_frames, axis=0, ignore_index=True)
    # move entry and subentry column to front
    first_cols = ["ENTRY", "SUBENTRY"]
    other_cols = [x for x in ret_df.columns if x not in first_cols]
    return ret_df[first_cols + other_cols]


def column_iterator(elem, path=None, sep=".", pointersep="#"):
    """Yield (column path, content) pairs for the leaves below ``elem``.

    Dictionaries are descended recursively while the keys visited so far are
    collected in the tuple ``path``.  A dictionary recognized as a pointer
    level is not descended further: each of its items is yielded under the
    joined path followed by ``pointersep`` and the pointer key.  Anything
    else is yielded as is under the path joined with ``sep``.
    """
    # NOTE: contains_pointers reports true if all keys consist of a single
    #       letter.  Directly below DATA/COMMON -> DATA/UNIT this heuristic
    #       can misfire when the only quantity present is the outgoing
    #       energy, indicated by E, so pointer detection is switched off
    #       for these locations.
    pointer_free_tails = (
        ("DATA", "DATA"),
        ("DATA", "UNIT"),
        ("COMMON", "DATA"),
        ("COMMON", "UNIT"),
    )
    skip_pointer_check = (
        path is not None and len(path) >= 2 and path[-2:] in pointer_free_tails
    )
    at_pointer_level = contains_pointers(elem) and not skip_pointer_check

    if at_pointer_level:
        for pointer, value in elem.items():
            yield sep.join(path) + pointersep + pointer, value
        return

    if not is_dic(elem):
        # leaf reached: emit the content under the accumulated path
        yield sep.join(path), elem
        return

    for key, child in elem.items():
        child_path = (key,) if path is None else path + (key,)
        yield from column_iterator(child, child_path, sep, pointersep)


def column_transformer(colname, content, sep, pointersep, keep_toplevel=False):
    """Map a column path to its table column name.

    ``colname`` is split at ``sep`` and handled according to the first
    section name found in it, checked in the order BIB, COMMON, DATA:

    * BIB: everything in front of ``BIB`` is cut away.
    * COMMON: ``UNIT`` columns are discarded, the ``DATA`` level directly
      below ``COMMON`` is removed and everything in front is cut away.
    * DATA: ``DATA.UNIT`` columns are discarded, ``DATA.DATA`` columns
      lose everything in front of the inner ``DATA``; other paths that
      contain ``DATA`` are returned unchanged.
    * otherwise a trailing ``__entryid`` / ``__subentid`` is renamed to
      ``ENTRY`` / ``SUBENTRY`` and any other name is returned unchanged.

    The section name itself is part of the result only if ``keep_toplevel``
    is true.  ``pointersep`` is accepted but not used in the body.

    Returns the tuple (new column name, content); the name is ``None`` for
    columns that are to be discarded.
    """
    parts = colname.split(sep)
    # number of extra leading segments to cut so the section name vanishes
    skip = 0 if keep_toplevel else 1

    if "BIB" in parts:
        anchor = parts.index("BIB")
        return sep.join(parts[anchor + skip :]), content

    if "COMMON" in parts:
        anchor = parts.index("COMMON")
        below = parts[anchor + 1]
        if below == "UNIT":
            return None, content
        if below == "DATA":
            # collapse COMMON.DATA.<x> to COMMON.<x>
            del parts[anchor + 1]
        return sep.join(parts[anchor + skip :]), content

    if "DATA" in parts:
        anchor = parts.index("DATA")
        below = parts[anchor + 1]
        if below == "DATA":
            return sep.join(parts[anchor + 1 + skip :]), content
        if below == "UNIT":
            return None, content
        # DATA followed by something else: leave the name untouched
        return colname, content

    renamed = {"__entryid": "ENTRY", "__subentid": "SUBENTRY"}
    return renamed.get(parts[-1], colname), content
16 changes: 15 additions & 1 deletion tests/test_transformers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path
import pytest
from exfor_parserpy import read_exfor
from exfor_parserpy.trafos import unitfy, depointerfy, uncommonfy, detextify
from exfor_parserpy.trafos import unitfy, depointerfy, uncommonfy, detextify, tablify


def test_unitfy_never_fails(entry_file):
Expand Down Expand Up @@ -34,3 +34,17 @@ def test_uncommonfy_never_fails(entry_file):
uncommonfy(content)
except Exception as exc:
assert False, f"uncommonfy failed on file {entry_file} with exception {exc}"


def test_tablify_never_fails(entry_file):
    """Check that tablify runs without raising on the given entry file."""
    content = read_exfor(entry_file)
    entryid = tuple(content)[0]
    subentries = content[entryid]
    only_first_subentry = (
        len(subentries) == 1 and tuple(subentries.keys())[0].endswith("001")
    )
    if only_first_subentry:
        # with nothing but the first subentry present,
        # tablify is expected to fail, so there is nothing to check
        return
    try:
        tablify(content)
    except Exception as exc:
        assert False, f"tablify failed on file {entry_file} with exception {exc}"

0 comments on commit 6079426

Please sign in to comment.