hackingmaterials · sgbaird · May 19, 2021 · Nov 20, 2021 · Nov 20, 2021 · Nov 20, 2021
diff --git a/matminer/ElM2D_.py b/matminer/ElM2D_.py
@@ -0,0 +1,582 @@
+"""
+Construct ElM2D plot of a list of inorganic compositions via Element Mover's Distance.
+
+Copyright (C) 2021  Cameron Hargreaves
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>
+
+--------------------------------------------------------------------------------
+
+Python Parser Source: https://github.com/Zapaan/python-chemical-formula-parser
+
+Periodic table JSON data: https://github.com/Bowserinator/Periodic-Table-JSON,
+updated to include the Pettifor number and modified Pettifor number from
+https://iopscience.iop.org/article/10.1088/1367-2630/18/9/093011
+
+Network simplex source modified to use numba from
+https://networkx.github.io/documentation/networkx-1.10/_modules/networkx/algorithms/flow/networksimplex.html#network_simplex
+
+Requires umap which may be installed via:
+    conda install -c conda-forge umap-learn
+"""
+import pickle as pk
+import re
+from copy import deepcopy
+from multiprocessing import cpu_count
+from operator import attrgetter
+
+import numpy as np
+import umap
+from numba import cuda
+
+from dist_matrix.njit_dist_matrix_full import dist_matrix as cpu_dist_matrix
+
+from matminer.featurizers.base import BaseFeaturizer
+
+# Default compute device; overridden by ElM2D class if self.target is not None
+use_cuda = cuda.is_available()
+target = "cuda" if use_cuda else "cpu"
+
+# The GPU implementation is imported only when numba reports a usable CUDA
+# device; otherwise the name is bound to None so that it always exists.
+if not use_cuda:
+    gpu_dist_matrix = None
+else:
+    from dist_matrix.cuda_dist_matrix_full import dist_matrix as gpu_dist_matrix
+
+
+class ElM2DFeaturizer(BaseFeaturizer):
+    """
+    Create intercompound EMD distance matrix and embedding via list of formulas.
+
+    Embedding types are:
+        - PCA
+        - UMAP
+    """
+
+    def __init__(
+        self,
+        formula_list=None,
+        n_proc=None,
+        n_components=2,
+        verbose=True,
+        chunksize=1,
+        umap_kwargs=None,
+        emd_algorithm="wasserstein",
+        target=None,
+    ):
+        """
+        Initialize parameters for Element Mover's Distance.
+
+        Parameters
+        ----------
+        formula_list : list of str, optional
+            List of chemical formulas, by default None
+        n_proc : int, optional
+            Number of processors to use (deprecated). When None, the value of
+            ``multiprocessing.cpu_count()`` is stored, by default None
+        n_components : int, optional
+            Number of embedding dimensions, by default 2
+        verbose : bool, optional
+            Whether to output verbose information, by default True
+        chunksize : int, optional
+            Size of chunks for multiprocessing (deprecated), by default 1
+        umap_kwargs : dict, optional
+            Extra keyword arguments for ``umap.UMAP``. The dict is copied, and the
+            copy always has "n_components" set to ``n_components`` and "metric" set
+            to "precomputed". By default None (no extra arguments)
+        emd_algorithm : str, optional
+            How to compute the earth mover's distances, by default "wasserstein"
+        target : str, optional
+            Preferred compute device, "cuda" or "cpu". It is stored on the instance
+            and consulted by EM2D when no per-call target is given, by default None
+        """
+        self.verbose = verbose
+        self.n_proc = cpu_count() if n_proc is None else n_proc
+        self.chunksize = chunksize
+
+        self.formula_list = formula_list  # Input formulae
+        # fmt: off
+        # modified pettifor scale; the key ORDER matters because _get_position
+        # uses the index of a symbol in this dict as its vector position
+        self.periodic_tab = {
+            "D": 102, "T": 102, "H": 102, "He": 0, "Li": 11,
+            "Be": 76, "B": 85, "C": 86, "N": 87, "O": 96, "F": 101,
+            "Ne": 1, "Na": 10, "Mg": 72, "Al": 77, "Si": 84, "P": 88,
+            "S": 95, "Cl": 100, "Ar": 2, "K": 9, "Ca": 15, "Sc": 47,
+            "Ti": 50, "V": 53, "Cr": 54, "Mn": 71, "Fe": 70, "Co": 69,
+            "Ni": 68, "Cu": 67, "Zn": 73, "Ga": 78, "Ge": 83, "As": 89,
+            "Se": 94, "Br": 99, "Kr": 3, "Rb": 8, "Sr": 14, "Y": 20, "Zr": 48,
+            "Nb": 52, "Mo": 55, "Tc": 58, "Ru": 60, "Rh": 62, "Pd": 64, "Ag": 66,
+            "Cd": 74, "In": 79, "Sn": 82, "Sb": 90, "Te": 93, "I": 98, "Xe": 4,
+            "Cs": 7, "Ba": 13, "La": 31, "Ce": 30, "Pr": 29, "Nd": 28, "Pm": 27,
+            "Sm": 26, "Eu": 16, "Gd": 25, "Tb": 24, "Dy": 23, "Ho": 22, "Er": 21,
+            "Tm": 19, "Yb": 17, "Lu": 18, "Hf": 49, "Ta": 51, "W": 56, "Re": 57,
+            "Os": 59, "Ir": 61, "Pt": 63, "Au": 65, "Hg": 75, "Tl": 80, "Pb": 81,
+            "Bi": 91, "Po": 92, "At": 97, "Rn": 5, "Fr": 6, "Ra": 12, "Ac": 32,
+            "Th": 33, "Pa": 34, "U": 35, "Np": 36, "Pu": 37, "Am": 38, "Cm": 39,
+            "Bk": 40, "Cf": 41, "Es": 42, "Fm": 43, "Md": 44, "No": 45, "Lr": 46,
+            "Rf": 0, "Db": 0, "Sg": 0, "Bh": 0, "Hs": 0, "Mt": 0, "Ds": 0, "Rg": 0,
+            "Cn": 0, "Nh": 0, "Fl": 0, "Mc": 0, "Lv": 0, "Ts": 0, "Og": 0, "Uue": 0,
+        }
+        # fmt: on
+
+        # Work on a private copy: a shared default dict or the caller's own dict
+        # must not pick up the two keys forced below.
+        self.umap_kwargs = {} if umap_kwargs is None else dict(umap_kwargs)
+        self.umap_kwargs["n_components"] = n_components
+        self.umap_kwargs["metric"] = "precomputed"
+
+        self.input_mat = None  # Pettifor vector representation of formula
+        self.embedder = None  # For accessing UMAP object
+        self.embedding = None  # Stores the last embedded coordinates
+        self.dm = None  # Stores distance matrix
+        self.emd_algorithm = emd_algorithm
+        self.target = target  # "cuda" or "cpu"
+
+    def fit(self, X, target=None):
+        """
+        Construct and store an ElMD distance matrix in ``self.dm``.
+
+        ``X`` is stored unchanged when the instance has a ``metric`` attribute equal
+        to "precomputed"; otherwise the pairwise distances between all entries of
+        ``X`` are computed with EM2D.
+
+        Parameters
+        ----------
+        X : list of str OR 2D array
+            A list of compound formula strings, or a precomputed distance matrix. If
+            using a precomputed distance matrix, ensure self.metric == "precomputed"
+        target : str, optional
+            Compute device forwarded to EM2D ("cuda" or "cpu"), by default None
+
+        Returns
+        -------
+        None.
+
+        Raises
+        ------
+        ValueError
+            If distances must be computed and ``self.emd_algorithm`` is not
+            "wasserstein", the only algorithm handled here.
+
+        """
+        self.formula_list = X
+
+        # __init__ does not assign ``metric``; a missing attribute means that the
+        # input is not a precomputed matrix.
+        metric = getattr(self, "metric", None)
+
+        if self.verbose:
+            print(f"Fitting {metric or self.emd_algorithm} kernel matrix")
+
+        if metric == "precomputed":
+            self.dm = X
+            return
+
+        if self.emd_algorithm != "wasserstein":
+            raise ValueError(
+                f"Unsupported emd_algorithm {self.emd_algorithm!r}; "
+                'only "wasserstein" is implemented'
+            )
+
+        # The message is optional, the computation is not: distances are built
+        # regardless of the verbose flag.
+        if self.verbose:
+            print("Constructing distances")
+        self.dm = self.EM2D(X, X, target=target)
+
+    def fit_transform(self, X, y=None, how="UMAP", n_components=2, target=None):
+        """
+        Fit the distance matrix on ``X`` and return its embedding.
+
+        Parameters
+        ----------
+        X : list of str
+            Compositions to embed.
+        y : 1D numerical array, optional
+            Target values to use for supervised UMAP embedding. The default is None.
+        how : str, optional
+            How to perform embedding ("UMAP" or "PCA"). The default is "UMAP".
+        n_components : int, optional
+            Not consulted by this method: the value stored in
+            ``self.umap_kwargs["n_components"]`` is forwarded to transform instead.
+            The default is 2.
+        target : str, optional
+            Compute device forwarded to fit. The default is None.
+
+        Returns
+        -------
+        embedding
+            Whatever ``self.transform`` returns for the fitted distance matrix.
+
+        """
+        self.fit(X, target=target)
+        n_dims = self.umap_kwargs["n_components"]
+        return self.transform(how=how, n_components=n_dims, y=y)
+
+    def transform(self, how="UMAP", n_components=2, y=None):
+        """
+        Call the selected embedding method (UMAP or PCA) and embed ``self.dm``.
+
+        Parameters
+        ----------
+        how : str, optional
+            How to perform embedding ("UMAP" or "PCA"). The default is "UMAP".
+        n_components : int, optional
+            Not consulted: the number of dimensions is read from
+            ``self.umap_kwargs["n_components"]``. The default is 2.
+        y : 1D numerical array, optional
+            Target values to use for supervised UMAP embedding. Any array-like
+            (list, ndarray, pandas Series) is accepted. The default is None.
+
+        Returns
+        -------
+        2D array
+            UMAP or PCA embedding, also stored in ``self.embedding``. None is
+            returned (after printing a hint) when no distance matrix exists yet.
+
+        Raises
+        ------
+        ValueError
+            If ``how`` is neither "UMAP" nor "PCA".
+
+        """
+        if self.dm is None:
+            print("No distance matrix computed, run fit() first")
+            return
+
+        n = self.umap_kwargs["n_components"]
+        if how == "UMAP":
+            message = f"Constructing UMAP Embedding to {n} dimensions"
+            if y is not None:
+                # np.asarray handles lists and ndarrays as well as pandas objects,
+                # which a bare ``y.to_numpy`` call does not.
+                y = np.asarray(y, dtype=float)
+                message += ", with a targeted embedding"
+            if self.verbose:
+                print(message)
+            self.embedder = umap.UMAP(**self.umap_kwargs)
+            # y=None is UMAP's own default, so one call covers both cases
+            self.embedding = self.embedder.fit_transform(self.dm, y)
+
+        elif how == "PCA":
+            if self.verbose:
+                print(f"Constructing PCA Embedding to {n} dimensions")
+            self.embedding = self.PCA(n_components=n)
+            if self.verbose:
+                print("Finished Embedding")
+
+        else:
+            raise ValueError(f'Unknown embedding method {how!r}; use "UMAP" or "PCA"')
+
+        return self.embedding
+
+    def EM2D(self, formulas, formulas2=None, target=None):
+        """
+        Earth Mover's 2D distances. See also EMD.
+
+        Parameters
+        ----------
+        formulas : list of str
+            First list of formulas for which to compute distances. If only formulas
+            is specified, pairwise distances within that single set are computed
+            (`pdist`-like behavior).
+        formulas2 : list of str, optional
+            Second list of formulas, which if specified, causes `cdist`-like
+            behavior (i.e. pairwise distances between two sets).
+        target : str, optional
+            Compute device, "cuda" or "cpu". If None, ``self.target`` is used; if
+            that is also None, "cuda" is chosen when numba reports a usable GPU and
+            "cpu" otherwise. A "cuda" request falls back to "cpu" when the GPU
+            implementation could not be imported.
+
+        Returns
+        -------
+        2D array
+            Pairwise distances.
+
+        Raises
+        ------
+        ValueError
+            If the resolved target is neither "cpu" nor "cuda".
+
+        """
+        is_xy = formulas2 is not None
+
+        # Position i of every ratio vector belongs to the i-th key of
+        # self.periodic_tab, the same ordering _get_position relies on.
+        self.lookup = list(self.periodic_tab)
+        n_elements = len(self.lookup)
+        scale = np.array(
+            [self.periodic_tab[symbol] for symbol in self.lookup], dtype=np.float64
+        )
+
+        def gen_ratio_vector(comp):
+            """Create a numpy array from a composition dictionary."""
+            if isinstance(comp, str):
+                comp = self._parse_formula(comp)
+                comp = self._normalise_composition(comp)
+
+            sorted_keys = sorted(comp.keys())
+            indices = np.array(
+                [self._get_position(k) for k in sorted_keys], dtype=np.int64
+            )
+            ratios = np.array([comp[k] for k in sorted_keys], dtype=np.float64)
+
+            numeric = np.zeros(shape=n_elements, dtype=np.float64)
+            numeric[indices] = ratios
+            return numeric
+
+        def gen_ratio_vectors(comps):
+            """Stack one ratio vector per composition into an (n, n_elements) array."""
+            rows = [gen_ratio_vector(comp) for comp in comps]
+            # the reshape keeps the array 2D even for an empty input
+            return np.array(rows, dtype=np.float64).reshape(len(rows), n_elements)
+
+        def get_mod_pettis(weights):
+            """Return the scale value wherever an element is present, else 0."""
+            return np.where(weights > 0, scale, 0.0)
+
+        U_weights = gen_ratio_vectors(formulas)
+        U = get_mod_pettis(U_weights)
+        if is_xy:
+            V_weights = gen_ratio_vectors(formulas2)
+            V = get_mod_pettis(V_weights)
+
+        # decide whether to use cpu or cuda version:
+        # explicit argument, then instance preference, then auto-detection
+        if target is None:
+            target = self.target
+        if target is None:
+            target = "cuda" if cuda.is_available() else "cpu"
+        if target == "cuda" and gpu_dist_matrix is None:
+            if self.verbose:
+                print("CUDA requested but not available, falling back to CPU")
+            target = "cpu"
+
+        if target == "cpu":
+            dist_matrix = cpu_dist_matrix
+        elif target == "cuda":
+            dist_matrix = gpu_dist_matrix
+        else:
+            raise ValueError(f'Unknown target {target!r}; use "cpu" or "cuda"')
+
+        if is_xy:
+            distances = dist_matrix(
+                U,
+                V=V,
+                U_weights=U_weights,
+                V_weights=V_weights,
+                metric="wasserstein",
+            )
+        else:
+            distances = dist_matrix(U, U_weights=U_weights, metric="wasserstein")
+
+        # package
+        self.U = U
+        self.U_weights = U_weights
+
+        if is_xy:
+            self.V = V
+            self.V_weights = V_weights
+
+        return distances
+
+    def PCA(self, n_components=5):
+        """
+        Perform multidimensional scaling (MDS) on a matrix of interpoint distances.
+
+        This finds a set of low dimensional points that have similar interpoint
+        distances.
+        Source: https://github.com/stober/mds/blob/master/src/mds.py
+
+        Parameters
+        ----------
+        n_components : int, optional
+            Number of leading columns of the projection to return, by default 5
+
+        Returns
+        -------
+        2D array
+            The first ``n_components`` columns of the projected points. The full
+            projection is stored in ``self.mds_points``.
+
+        Raises
+        ------
+        ValueError
+            If ``self.dm`` is None or empty.
+        """
+        if self.dm is None or len(self.dm) == 0:
+            raise ValueError(
+                "No distance matrix computed, call fit_transform with a list of "
+                "compositions, or load a saved matrix with import_dm()"
+            )
+
+        dm = np.asarray(self.dm, dtype=float)
+        n = dm.shape[0]
+
+        if self.verbose:
+            print(f"Constructing {n}x{n_components} Gram matrix")
+        E = -0.5 * dm ** 2
+
+        # Row and column means, kept 2D so they broadcast without np.mat
+        row_means = E.mean(axis=1, keepdims=True)
+        col_means = E.mean(axis=0, keepdims=True)
+
+        # From Principles of Multivariate Analysis: A User's Perspective (page 107).
+        F = E - row_means - col_means + E.mean()
+
+        if self.verbose:
+            print("Computing Eigen Decomposition")
+        U, S, _ = np.linalg.svd(F)
+
+        # scale each left singular vector by the root of its singular value
+        Y = U * np.sqrt(S)
+
+        if self.verbose:
+            print("PCA Projected Points Computed")
+        self.mds_points = Y
+
+        return Y[:, :n_components]
+
+    def _parse(self, formula):
+        """
+        Return the molecule dict and length of parsed part.
+
+        Recurse on opening brackets to parse the subpart and
+        return on closing ones because it is the end of said subpart.
+
+        Parameters
+        ----------
+        formula : str
+            Formula, or the remainder of one following an opening bracket.
+
+        Returns
+        -------
+        tuple
+            ``(mol, i)``: the fused atom-count dict and the cursor position reached
+            in ``formula``.
+        """
+        # NOTE(review): CLOSERS, OPENERS, ATOM_REGEX, _dictify and _fuse are not
+        # defined in the visible part of this class - confirm they are provided
+        # elsewhere (e.g. by a base class).
+        q = []
+        mol = {}
+        i = 0
+
+        while i < len(formula):
+            # Using a classic loop allows for manipulating the cursor
+            token = formula[i]
+
+            if token in self.CLOSERS:
+                # Check for an index (multiplier) right after the closing bracket;
+                # raw string so the backslashes reach the regex engine untouched
+                m = re.match(r"\d+\.*\d*|\.\d*", formula[i + 1 :])
+                if m:
+                    weight = float(m.group(0))
+                    i += len(m.group(0))
+                else:
+                    weight = 1
+
+                submol = self._dictify(re.findall(self.ATOM_REGEX, "".join(q)))
+                return self._fuse(mol, submol, weight), i
+
+            elif token in self.OPENERS:
+                submol, consumed = self._parse(formula[i + 1 :])
+                mol = self._fuse(mol, submol)
+                # skip the already read submol
+                i += consumed + 1
+            else:
+                q.append(token)
+
+            i += 1
+
+        # Fuse in all that's left at base level
+        return (
+            self._fuse(mol, self._dictify(re.findall(self.ATOM_REGEX, "".join(q)))),
+            i,
+        )
+
+    def _parse_formula(self, formula):
+        """
+        Parse ``formula`` and return a dict with the occurrences of each atom.
+
+        Raises ValueError when ``self._is_balanced`` rejects the formula.
+        """
+        if self._is_balanced(formula):
+            parsed = self._parse(formula)
+            # _parse returns (molecule dict, cursor); only the dict is needed
+            return parsed[0]
+
+        raise ValueError("Your brackets not matching in pairs ![{]$[&?)]}!]")
+
+    def _normalise_composition(self, input_comp):
+        """
+        Return a copy of the composition with every amount divided by the total.
+
+        The input is deep-copied first, so the caller's object is left untouched.
+        A string input is parsed with ``_parse_formula`` before normalising.
+        """
+        comp = deepcopy(input_comp)
+        # a raw formula string still has to be turned into a count dict
+        if isinstance(comp, str):
+            comp = self._parse_formula(comp)
+
+        total = sum(comp.values(), 0.0)
+
+        for symbol in comp:
+            comp[symbol] = comp[symbol] / total
+
+        return comp
+
+    def _get_position(self, element):
+        """
+        Return the index of ``element`` in the key order of ``self.periodic_tab``.
+
+        Parameters
+        ----------
+        element : str
+            Element symbol to look up.
+
+        Returns
+        -------
+        int
+            Zero-based position of the symbol, or -1 when the symbol is unknown
+            and ``self.strict_parsing`` is False.
+
+        Raises
+        ------
+        KeyError
+            If the symbol is unknown and strict parsing is in effect. Strict
+            parsing is assumed when the instance has no ``strict_parsing``
+            attribute.
+        """
+        try:
+            return list(self.periodic_tab).index(element)
+        except ValueError:
+            # ``strict_parsing`` is not assigned by __init__, so read it
+            # defensively; defaulting to strict avoids silently returning -1,
+            # which would index the LAST slot of a vector.
+            if getattr(self, "strict_parsing", True):
+                raise KeyError(
+                    f"Element {element!r} is not in the periodic_tab dictionary. "
+                    "Try a different representation or use strict_parsing=False"
+                ) from None
+            return -1
+
+    def __repr__(self):
+        """
+        Summary of ElM2D object: length, diversity, and max distance if dm exists.
+
+        The size is the length of ``self.formula_list`` when it is set, otherwise
+        the length of ``self.dm`` (e.g. after import_dm without any formulas).
+        Without a distance matrix the bare string "ElM2D()" is returned.
+        """
+        if self.dm is None:
+            return "ElM2D()"
+
+        if self.formula_list is None:
+            size = len(self.dm)
+        else:
+            size = len(self.formula_list)
+
+        # adjacent literals instead of backslash continuations, which would embed
+        # the source indentation in the result
+        return (
+            f"ElM2D(size={size}, "
+            f"chemical_diversity={np.mean(self.dm)} +/- {np.std(self.dm)}, "
+            f"maximal_distance={np.max(self.dm)})"
+        )
+
+    def export_dm(self, path):
+        """Write the stored distance matrix (self.dm) to path as comma-separated text."""
+        np.savetxt(fname=path, X=self.dm, delimiter=",")
+
+    def import_dm(self, path):
+        """Read a comma-separated file at path and store it as self.dm."""
+        loaded = np.loadtxt(fname=path, delimiter=",")
+        self.dm = loaded
+
+    def export_embedding(self, path):
+        """Write the stored embedding (self.embedding) to path as comma-separated text."""
+        np.savetxt(fname=path, X=self.embedding, delimiter=",")
+
+    def import_embedding(self, path):
+        """Read a comma-separated file at path and store it as self.embedding."""
+        loaded = np.loadtxt(fname=path, delimiter=",")
+        self.embedding = loaded
+
+    def _pool_featurize(self, comp):
+        """Return the ``feature_vector`` of an ElMD object built for ``comp``."""
+        # NOTE(review): neither ``ElMD`` nor ``self.metric`` is defined in the
+        # visible part of this module - confirm both exist before relying on this.
+        elmd = ElMD(comp, metric=self.metric)
+        return elmd.feature_vector
+
+    def featurize(self, formula_list=None, how="mean"):
+        """
+        Featurize a list of formulas.
+
+        Each formula is passed to ``self._pool_featurize`` and the results are
+        stacked into one array.
+
+        Parameters
+        ----------
+        formula_list : list of str, optional
+            Formulas to featurize. Falls back to ``self.formula_list`` when None;
+            when ``self.formula_list`` is None it is set to this list.
+        how : str, optional
+            Not used by this method, by default "mean"
+
+        Returns
+        -------
+        numpy array
+            One entry (row) per formula.
+
+        Raises
+        ------
+        ValueError
+            If neither ``formula_list`` nor ``self.formula_list`` is available.
+        """
+        if formula_list is None:
+            if self.formula_list is None:
+                raise ValueError("You must enter a list of compositions first")
+            formula_list = self.formula_list
+        elif self.formula_list is None:
+            self.formula_list = formula_list
+
+        if self.verbose:
+            # ``metric`` is not assigned by __init__, so read it defensively
+            metric = getattr(self, "metric", None)
+            print(
+                f"Constructing compositionally weighted {metric} feature vectors "
+                "for each composition"
+            )
+
+        # A list, not a lazy ``map`` object: np.array would otherwise wrap the
+        # iterator itself in a 0-d object array.
+        vectors = [self._pool_featurize(comp) for comp in formula_list]
+
+        if self.verbose:
+            print("Complete")
+
+        return np.array(vectors)
+
+    def save(self, filepath):
+        """
+        Pickle all instance attributes to ``filepath + ".pk"``.
+
+        The whole ``__dict__`` is written, so the distance matrix ``dm`` is
+        included whenever it is set.
+
+        Parameters
+        ----------
+        filepath : str
+            Filepath (without the ".pk" suffix) for which to save the pickle.
+
+        Returns
+        -------
+        None.
+
+        """
+        save_dict = dict(self.__dict__)
+        # the context manager closes the file even if pickling fails
+        with open(filepath + ".pk", "wb") as f_handle:
+            pk.dump(save_dict, f_handle)
+
+    def load(self, filepath):
+        """
+        Load variables from pickle file into this instance.
+
+        Every key of the pickled dict becomes (or overwrites) an instance
+        attribute of the same name.
+
+        Parameters
+        ----------
+        filepath : str
+            Filepath (without the ".pk" suffix) for which to load the pickle.
+
+        Returns
+        -------
+        None.
+
+        """
+        # SECURITY: unpickling can execute arbitrary code - only load files that
+        # come from a trusted source.
+        with open(filepath + ".pk", "rb") as f_handle:
+            load_dict = pk.load(f_handle)
+
+        self.__dict__.update(load_dict)
diff --git a/matminer/featurizers/tests/ElM2D_check.csv b/matminer/featurizers/tests/ElM2D_check.csv
diff --git a/matminer/featurizers/tests/test_ElM2D.py b/matminer/featurizers/tests/test_ElM2D.py
@@ -0,0 +1,51 @@
+"""
+Test Element Mover's 2D Distance Matrix via network simplex and "wasserstein" methods.
+
+This test ensures that the fast implementation "wasserstein" produces "close" values
+to the original network simplex method.
+"""
+import unittest
+
+from os.path import join, dirname, relpath
+
+from numpy import genfromtxt
+from numpy.testing import assert_allclose
+import pandas as pd
+
+from matminer.featurizers.ElM2D_ import ElM2DFeaturizer
+
+# None lets the featurizer choose the compute device itself; a hard-coded "cuda"
+# makes the test fail on hosts without a GPU.
+target = None
+
+
+class Testing(unittest.TestCase):
+    """Compare the "wasserstein" distance matrix against stored reference values."""
+
+    def test_dm_close(self):
+        """Fit on the first ``nformulas`` formulas and compare with ElM2D_check.csv."""
+        nformulas = 500
+
+        # The reference file holds a 500 x 500 distance matrix; validate the
+        # requested size before doing any work.
+        if nformulas > 500:
+            raise ValueError(
+                "nformulas>500, should be <=500 (received: {})".format(nformulas)
+            )
+
+        test_dir = dirname(relpath(__file__))
+
+        mapper = ElM2DFeaturizer()
+        df = pd.read_csv(join(test_dir, "stable-mp-500.csv"))
+        sub_formulas = df["formula"][:nformulas]
+
+        mapper.fit(sub_formulas, target=target)
+        dm_wasserstein = mapper.dm
+
+        # ElM2D_check.csv is added next to this test module, so no extra "tests"
+        # path component is needed.
+        # NOTE(review): genfromtxt splits on whitespace by default - confirm the
+        # reference file is not comma-delimited.
+        dm_check = genfromtxt(join(test_dir, "ElM2D_check.csv"))
+
+        assert_allclose(
+            dm_wasserstein,
+            dm_check[:nformulas, :nformulas],
+            atol=1e-3,
+            err_msg="wasserstein did not match ElM2D (0.4.0) with ElMD (0-4-3).",
+        )
+
+
+# Run the unittest runner when this module is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/matminer/utils/data.py b/matminer/utils/data.py
@@ -14,10 +14,10 @@
 import pandas as pd
 from glob import glob
 
-from pymatgen.core import Element
+from pymatgen.core.periodic_table import Element
 from pymatgen.core.periodic_table import _pt_data
 
-__author__ = 'Kiran Mathew, Jiming Chen, Logan Ward, Anubhav Jain, Alex Dunn'
+__author__ = "Kiran Mathew, Jiming Chen, Logan Ward, Anubhav Jain, Alex Dunn"
 
 module_dir = os.path.dirname(os.path.abspath(__file__))
 
@@ -94,9 +94,9 @@ def get_charge_dependent_property_from_specie(self, specie, property_name):
             (float) - Value of property
         """
 
-        return self.get_charge_dependent_property(specie.element,
-                                                  specie.oxi_state,
-                                                  property_name)
+        return self.get_charge_dependent_property(
+            specie.element, specie.oxi_state, property_name
+        )
 
 
 class CohesiveEnergyData(AbstractData):
@@ -110,11 +110,12 @@ class CohesiveEnergyData(AbstractData):
 
     def __init__(self):
         # Load elemental cohesive energy data from json file
-        with open(os.path.join(module_dir, 'data_files',
-                               'cohesive_energies.json'), 'r') as f:
+        with open(
+            os.path.join(module_dir, "data_files", "cohesive_energies.json"), "r"
+        ) as f:
             self.cohesive_energy_data = json.load(f)
 
-    def get_elemental_property(self, elem, property_name='cohesive energy'):
+    def get_elemental_property(self, elem, property_name="cohesive energy"):
         """
         Args:
             elem: (Element) Element of interest
@@ -140,10 +141,16 @@ class DemlData(OxidationStateDependentData, OxidationStatesMixin):
 
     def __init__(self):
         from matminer.utils.data_files.deml_elementdata import properties
+
         self.all_props = properties
-        self.available_props = list(self.all_props.keys()) + \
-                               ["formal_charge", "valence_s", "valence_p",
-                                "valence_d", "first_ioniz", "total_ioniz"]
+        self.available_props = list(self.all_props.keys()) + [
+            "formal_charge",
+            "valence_s",
+            "valence_p",
+            "valence_d",
+            "first_ioniz",
+            "total_ioniz",
+        ]
 
         # Compute the FERE correction energy
         fere_corr = {}
@@ -152,13 +159,18 @@ def __init__(self):
         self.all_props["FERE correction"] = fere_corr
 
         # List out the available charge-dependent properties
-        self.charge_dependent_properties = ["xtal_field_split", "magn_moment",
-                                            "so_coupling", "sat_magn"]
+        self.charge_dependent_properties = [
+            "xtal_field_split",
+            "magn_moment",
+            "so_coupling",
+            "sat_magn",
+        ]
 
     def get_elemental_property(self, elem, property_name):
         if "valence" in property_name:
             valence_dict = self.all_props["valence_e"][
-                self.all_props["col_num"][elem.symbol]]
+                self.all_props["col_num"][elem.symbol]
+            ]
             if property_name[-1] in ["s", "p", "d"]:
                 # Return one of the shells
                 return valence_dict[property_name[-1]]
@@ -175,12 +187,14 @@ def get_oxidation_states(self, elem):
     def get_charge_dependent_property(self, element, charge, property_name):
         if property_name == "total_ioniz":
             if charge < 0:
-                raise ValueError(
-                    "total ionization energy only defined for charge > 0")
+                raise ValueError("total ionization energy only defined for charge > 0")
             return sum(self.all_props["ionization_en"][element.symbol][:charge])
         else:
-            return self.all_props[property_name].get(element.symbol, {}).get(
-                charge, np.nan)
+            return (
+                self.all_props[property_name]
+                .get(element.symbol, {})
+                .get(charge, np.nan)
+            )
 
 
 class MagpieData(AbstractData, OxidationStatesMixin):
@@ -198,32 +212,32 @@ class MagpieData(AbstractData, OxidationStatesMixin):
     def __init__(self):
         self.all_elemental_props = dict()
         available_props = []
-        self.data_dir = os.path.join(module_dir, "data_files",
-                                     'magpie_elementdata')
+        self.data_dir = os.path.join(module_dir, "data_files", "magpie_elementdata")
 
         # Make a list of available properties
         for datafile in glob(os.path.join(self.data_dir, "*.table")):
-            available_props.append(
-                os.path.basename(datafile).replace('.table', ''))
+            available_props.append(os.path.basename(datafile).replace(".table", ""))
 
         # parse and store elemental properties
         for descriptor_name in available_props:
-            with open(os.path.join(self.data_dir,
-                                   '{}.table'.format(descriptor_name)),
-                      'r') as f:
+            with open(
+                os.path.join(self.data_dir, "{}.table".format(descriptor_name)), "r"
+            ) as f:
                 self.all_elemental_props[descriptor_name] = dict()
                 lines = f.readlines()
                 for atomic_no in range(1, len(_pt_data) + 1):  # max Z=103
                     try:
                         if descriptor_name in ["OxidationStates"]:
-                            prop_value = [float(i) for i in
-                                          lines[atomic_no - 1].split()]
+                            prop_value = [
+                                float(i) for i in lines[atomic_no - 1].split()
+                            ]
                         else:
                             prop_value = float(lines[atomic_no - 1])
                     except (ValueError, IndexError):
                         prop_value = float("NaN")
                     self.all_elemental_props[descriptor_name][
-                        Element.from_Z(atomic_no).symbol] = prop_value
+                        Element.from_Z(atomic_no).symbol
+                    ] = prop_value
 
     def get_elemental_property(self, elem, property_name):
         return self.all_elemental_props[property_name][elem.symbol]
@@ -263,9 +277,12 @@ def get_oxidation_states(self, elem):
                 or all known oxidation states
         Returns:
             [int] list of oxidation states
-            """
-        return elem.common_oxidation_states if self.use_common_oxi_states \
+        """
+        return (
+            elem.common_oxidation_states
+            if self.use_common_oxi_states
             else elem.oxidation_states
+        )
 
     def get_charge_dependent_property(self, element, charge, property_name):
         return getattr(element, property_name)[charge]
@@ -288,21 +305,91 @@ class MixingEnthalpy:
     """
 
     def __init__(self):
-        mixing_dataset = pd.read_csv(os.path.join(module_dir, 'data_files',
-                                                  'MiedemaLiquidDeltaHf.tsv'),
-                                     delim_whitespace=True)
+        mixing_dataset = pd.read_csv(
+            os.path.join(module_dir, "data_files", "MiedemaLiquidDeltaHf.tsv"),
+            delim_whitespace=True,
+        )
         self.mixing_data = {}
         for a, b, dHf in mixing_dataset.itertuples(index=False):
             key = tuple(sorted((a, b)))
             self.mixing_data[key] = dHf
         valid_elements = [
-            "Dy", "Mn", "Y", "Nd", "Ag", "Cs", "Tm", "Pd", "Sn", "Rh", "Pr",
-            "Er", "K", "In", "Tb", "Rb", "H", "N", "Ni", "Hg", "Ca", "Mo", "Li",
-            "Th", "U", "At", "Ga", "La", "Ru", "Lu", "Eu", "Si", "B", "Zr",
-            "Ce", "Pm", "Ge", "Sm", "Ta", "Ti", "Po", "Sc", "Mg", "Sr", "P",
-            "C", "Ir", "Pa", "V", "Zn", "Sb", "Na", "W", "Re", "Tl", "Pt", "Gd",
-            "Cr", "Co", "Ba", "Os", "Hf", "Pb", "Cu", "Tc", "Al", "As", "Ho",
-            "Yb", "Au", "Be", "Nb", "Cd", "Fe", "Bi"]
+            "Dy",
+            "Mn",
+            "Y",
+            "Nd",
+            "Ag",
+            "Cs",
+            "Tm",
+            "Pd",
+            "Sn",
+            "Rh",
+            "Pr",
+            "Er",
+            "K",
+            "In",
+            "Tb",
+            "Rb",
+            "H",
+            "N",
+            "Ni",
+            "Hg",
+            "Ca",
+            "Mo",
+            "Li",
+            "Th",
+            "U",
+            "At",
+            "Ga",
+            "La",
+            "Ru",
+            "Lu",
+            "Eu",
+            "Si",
+            "B",
+            "Zr",
+            "Ce",
+            "Pm",
+            "Ge",
+            "Sm",
+            "Ta",
+            "Ti",
+            "Po",
+            "Sc",
+            "Mg",
+            "Sr",
+            "P",
+            "C",
+            "Ir",
+            "Pa",
+            "V",
+            "Zn",
+            "Sb",
+            "Na",
+            "W",
+            "Re",
+            "Tl",
+            "Pt",
+            "Gd",
+            "Cr",
+            "Co",
+            "Ba",
+            "Os",
+            "Hf",
+            "Pb",
+            "Cu",
+            "Tc",
+            "Al",
+            "As",
+            "Ho",
+            "Yb",
+            "Au",
+            "Be",
+            "Nb",
+            "Cd",
+            "Fe",
+            "Bi",
+        ]
         self.valid_element_list = [Element(e) for e in valid_elements]
 
     def get_mixing_enthalpy(self, elemA, elemB):
@@ -335,8 +422,7 @@ class MatscholarElementData(AbstractData):
     """
 
     def __init__(self):
-        dfile = os.path.join(module_dir,
-                             "data_files/matscholar_els.json")
+        dfile = os.path.join(module_dir, "data_files/matscholar_els.json")
         with open(dfile, "r") as fp:
             embeddings = json.load(fp)
         self.prop_names = ["embedding {}".format(i) for i in range(1, 201)]
@@ -373,8 +459,7 @@ class MEGNetElementData(AbstractData):
     """
 
     def __init__(self):
-        dfile = os.path.join(module_dir,
-                             "data_files/megnet_elemental_embedding.json")
+        dfile = os.path.join(module_dir, "data_files/megnet_elemental_embedding.json")
         self._dummy = "Dummy"
         with open(dfile, "r") as fp:
             embeddings = json.load(fp)
@@ -441,44 +526,55 @@ def __init__(self, interpolate_soft=True):
         is usually provided for those less electronegative anions in a 9+
         oxidation state, indicating they can be used with all oxidation states.
         """
-        filepath = os.path.join(
-            module_dir,
-            "data_files",
-            "bvparm2020.cif")
-        self.params = pd.read_csv(filepath, sep='\s+',
-                                  header=None,
-                                  names=['Atom1', 'Atom1_valence',
-                                         'Atom2', 'Atom2_valence',
-                                         'Ro', 'B',
-                                         'ref_id', 'details'],
-                                  skiprows=172,
-                                  skipfooter=1,
-                                  index_col=False,
-                                  engine="python")
+        filepath = os.path.join(module_dir, "data_files", "bvparm2020.cif")
+        self.params = pd.read_csv(
+            filepath,
+            sep="\s+",
+            header=None,
+            names=[
+                "Atom1",
+                "Atom1_valence",
+                "Atom2",
+                "Atom2_valence",
+                "Ro",
+                "B",
+                "ref_id",
+                "details",
+            ],
+            skiprows=172,
+            skipfooter=1,
+            index_col=False,
+            engine="python",
+        )
         if interpolate_soft:
             self.params = self.interpolate_soft_anions()
 
     def interpolate_soft_anions(self):
         """Fill in missing parameters for oxidation states of soft anions."""
-        high_electroneg = '|'.join(['O', 'Cl', 'F'])
-        subset = self.params.loc[(self.params['Atom1_valence'] == 9) & (~self.params['Atom2'].str.contains(high_electroneg))]
-        cation_subset = subset['Atom1'].unique()
+        high_electroneg = "|".join(["O", "Cl", "F"])
+        subset = self.params.loc[
+            (self.params["Atom1_valence"] == 9)
+            & (~self.params["Atom2"].str.contains(high_electroneg))
+        ]
+        cation_subset = subset["Atom1"].unique()
         data = []
         for cation in cation_subset:
-            anions = subset.loc[subset['Atom1'] == cation]['Atom2'].unique()
+            anions = subset.loc[subset["Atom1"] == cation]["Atom2"].unique()
             for anion in anions:
-                an_val, Ro, b, ref_id = subset.loc[(subset['Atom1'] == cation)
-                        & (subset['Atom2']==anion)][['Atom2_valence', 'Ro', 'B', 'ref_id']].values[0]
+                an_val, Ro, b, ref_id = subset.loc[
+                    (subset["Atom1"] == cation) & (subset["Atom2"] == anion)
+                ][["Atom2_valence", "Ro", "B", "ref_id"]].values[0]
                 for n in range(1, 7):
-                    entry = {'Atom1': cation,
-                             'Atom1_valence': n,
-                             'Atom2': anion,
-                             'Atom2_valence': an_val,
-                             'Ro': Ro,
-                             'B': b,
-                             'ref_id': ref_id,
-                             'details': 'Interpolated'
-                            }
+                    entry = {
+                        "Atom1": cation,
+                        "Atom1_valence": n,
+                        "Atom2": anion,
+                        "Atom2_valence": an_val,
+                        "Ro": Ro,
+                        "B": b,
+                        "ref_id": ref_id,
+                        "details": "Interpolated",
+                    }
                     data.append(entry)
         new_data = pd.DataFrame(data)
         new_params = self.params.append(new_data, sort=True, ignore_index=True)
@@ -496,10 +592,11 @@ def get_bv_params(self, cation, anion, cat_val, an_val):
         """
 
         bv_data = self.params
-        bond_val_list = self.params.loc[(bv_data['Atom1'] == str(cation)) \
-                                & (bv_data['Atom1_valence'] == cat_val) \
-                                & (bv_data['Atom2'] == str(anion)) \
-                                & (bv_data['Atom2_valence'] == an_val)]
-        return bond_val_list.iloc[0] # If multiple values exist, take first one
-                                     # as recommended for reliability.
-
+        bond_val_list = self.params.loc[
+            (bv_data["Atom1"] == str(cation))
+            & (bv_data["Atom1_valence"] == cat_val)
+            & (bv_data["Atom2"] == str(anion))
+            & (bv_data["Atom2_valence"] == an_val)
+        ]
+        # If multiple values exist, take the first one, as recommended for reliability.
+        return bond_val_list.iloc[0]