a-r-j · kamurani · Jul 2, 2022 · Jul 2, 2022 · Jul 2, 2022 · Jul 6, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -206,6 +206,8 @@
 * [Bugfix] - [#208](https://github.com/a-r-j/graphein/pull/208) explicitly installs `jupyter_contrib_nbextensions` in Docker.
 
 ### 1.5.1
+* [Feature] - [#197](https://github.com/a-r-j/graphein/pull/197/) adds support for sizing and colouring nodes in asteroid plots
+* [Feature] - [#197](https://github.com/a-r-j/graphein/pull/197/) adds utilities for retrieving a list of graph/node/edge attribute names in `graphein.utils.utils`.
 
 #### Protein
 
@@ -215,6 +217,7 @@
 * [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds a `residue_id` column to PDB dfs to enable easier accounting in atom graphs.
 * [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) refactors torch geometric datasets to use parallelised download for faster dataset preparation.
 
+
 #### Bugfixes
 
 * [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.

diff --git a/graphein/protein/features/nodes/amino_acid.py b/graphein/protein/features/nodes/amino_acid.py
@@ -18,6 +18,8 @@
     BASE_AMINO_ACIDS,
     HYDROGEN_BOND_ACCEPTORS,
     HYDROGEN_BOND_DONORS,
+    HYDROPHOBICITY_SCALES,
+    HYDROPHOBICITY_TYPE,
     RESI_THREE_TO_1,
 )
 from graphein.utils.utils import onek_encoding_unk
@@ -277,3 +279,43 @@ def hydrogen_bond_acceptor(
     if not sum_features:
         features = np.array(features > 0).astype(int)
     d["hbond_acceptors"] = features
+
+
+def hydrophobicity(
+    n: str,
+    d: Dict[str, Any],
+    mapping: HYDROPHOBICITY_TYPE = "kd",
+    return_array: bool = True,
+) -> Union[np.ndarray, pd.Series]:
+    """
+    Adds hydrophobicity values for each residue to graph nodes.
+    See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for
+    values and available scales.
+
+    :param n: Node ID. Unused - kept to maintain consistent function signature.
+    :type n: str
+    :param d: Dictionary of node attributes.
+    :type d: Dict[str, Any]
+    :param mapping: Which hydrophobicity scale to use. See
+        :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_TYPE` for supported types.
+    :type mapping: graphien.protein.resi_atoms.HYDROPHOBICITY_TYPE
+    :param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns
+        a ``pd.Series``. Default is ``True``.
+    :type return_array: bool
+    """
+    assert (
+        mapping in HYDROPHOBICITY_SCALES.keys()
+    ), f"Unsupported mapping: {mapping}. Supported mappings: {HYDROPHOBICITY_SCALES.keys()}"
+    hydr = HYDROPHOBICITY_SCALES[mapping]
+
+    amino_acid = d["residue_name"]
+    try:
+        features = hydr[amino_acid]
+    except:
+        features = pd.Series(np.zeros(1))
+
+    if return_array:
+        features = np.array(features)
+
+    d[f"hydrophobicity_{mapping}"] = features
+    return features
diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 from sklearn.preprocessing import StandardScaler
+from typing_extensions import Literal
 
 BACKBONE_ATOMS: List[str] = ["N", "CA", "C", "O"]
 """Atoms present in Amino Acid Backbones."""
@@ -1103,6 +1104,151 @@
 https://pubs.acs.org/doi/10.1021/j100785a001
 """
 
+HYDROPHOBICITY_TYPE = Literal["kd", "ww", "hh", "mf", "tt"]
+"""Supported hydrophobicity types. See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for further details."""
+
+HYDROPHOBICITY_SCALES: Dict[str, Dict[str, float]] = {
+    "kd": {  # kdHydrophobicity (a)
+        "ILE": 4.5,
+        "VAL": 4.2,
+        "LEU": 3.8,
+        "PHE": 2.8,
+        "CYS": 2.5,
+        "MET": 1.9,
+        "ALA": 1.8,
+        "GLY": -0.4,
+        "THR": -0.7,
+        "SER": -0.8,
+        "TRP": -0.9,
+        "TYR": -1.3,
+        "PRO": -1.6,
+        "HIS": -3.2,
+        "GLU": -3.5,
+        "GLN": -3.5,
+        "ASP": -3.5,
+        "ASN": -3.5,
+        "LYS": -3.9,
+        "ARG": -4.5,
+    },
+    "ww": {  # wwHydrophobicity (b)
+        "ILE": 0.31,
+        "VAL": -0.07,
+        "LEU": 0.56,
+        "PHE": 1.13,
+        "CYS": 0.24,
+        "MET": 0.23,
+        "ALA": -0.17,
+        "GLY": -0.01,
+        "THR": -0.14,
+        "SER": -0.13,
+        "TRP": 1.85,
+        "TYR": 0.94,
+        "PRO": -0.45,
+        "HIS": -0.96,
+        "GLU": -2.02,
+        "GLN": -0.58,
+        "ASP": -1.23,
+        "ASN": -0.42,
+        "LYS": -0.99,
+        "ARG": -0.81,
+    },
+    "hh": {  # hhHydrophobicity (c)
+        "ILE": -0.60,
+        "VAL": -0.31,
+        "LEU": -0.55,
+        "PHE": -0.32,
+        "CYS": -0.13,
+        "MET": -0.10,
+        "ALA": 0.11,
+        "GLY": 0.74,
+        "THR": 0.52,
+        "SER": 0.84,
+        "TRP": 0.30,
+        "TYR": 0.68,
+        "PRO": 2.23,
+        "HIS": 2.06,
+        "GLU": 2.68,
+        "GLN": 2.36,
+        "ASP": 3.49,
+        "ASN": 2.05,
+        "LYS": 2.71,
+        "ARG": 2.58,
+    },
+    "mf": {  # mfHydrophobicity (d)
+        "ILE": -1.56,
+        "VAL": -0.78,
+        "LEU": -1.81,
+        "PHE": -2.20,
+        "CYS": 0.49,
+        "MET": -0.76,
+        "ALA": 0.0,
+        "GLY": 1.72,
+        "THR": 1.78,
+        "SER": 1.83,
+        "TRP": -0.38,
+        "TYR": -1.09,
+        "PRO": -1.52,
+        "HIS": 4.76,
+        "GLU": 1.64,
+        "GLN": 3.01,
+        "ASP": 2.95,
+        "ASN": 3.47,
+        "LYS": 5.39,
+        "ARG": 3.71,
+    },
+    "tt": {  # ttHydrophobicity (e)
+        "ILE": 1.97,
+        "VAL": 1.46,
+        "LEU": 1.82,
+        "PHE": 1.98,
+        "CYS": -0.30,
+        "MET": 1.40,
+        "ALA": 0.38,
+        "GLY": -0.19,
+        "THR": -0.32,
+        "SER": -0.53,
+        "TRP": 1.53,
+        "TYR": 0.49,
+        "PRO": -1.44,
+        "HIS": -1.44,
+        "GLU": -2.90,
+        "GLN": -1.84,
+        "ASP": -3.27,
+        "ASN": -1.62,
+        "LYS": -3.46,
+        "ARG": -2.57,
+    },
+}
+"""
+Set of (5) dictionaries that map amino acid 3-letter codes to their hydrophobicity.
+
+The scales included are from Chimera (UCSF) https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html
+and are as follows:
+
+    * kdHydrophobicity
+        (a) A simple method for displaying the hydropathic character of a protein. Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32.
+        https://www.ncbi.nlm.nih.gov/pubmed/7108955
+
+    * wwHydrophobicity
+        (b) Experimentally determined hydrophobicity scale for proteins at membrane interfaces. Wimley WC, White SH. Nat Struct Biol. 1996 Oct;3(10):842-8.
+        https://www.ncbi.nlm.nih.gov/pubmed/8836100
+
+    * hhHydrophobicity
+        (c) Recognition of transmembrane helices by the endoplasmic reticulum translocon. Hessa T, Kim H, Bihlmaier K, Lundin C, Boekel J, Andersson H, Nilsson I, White SH, von Heijne G. Nature. 2005 Jan 27;433(7024):377-81, supplementary data.
+        https://www.ncbi.nlm.nih.gov/pubmed/15674282
+
+        In this scale, negative values indicate greater hydrophobicity.
+
+    * mfHydrophobicity
+        (d)  Side-chain hydrophobicity scale derived from transmembrane protein folding into lipid bilayers. Moon CP, Fleming KG. Proc Natl Acad Sci USA. 2011 Jun 21;108(25):10174-7, supplementary data.
+        https://www.ncbi.nlm.nih.gov/pubmed/21606332
+
+        In this scale, negative values indicate greater hydrophobicity.
+
+    * ttHydrophobicity
+        (e) An amino acid “transmembrane tendency” scale that approaches the theoretical limit to accuracy for prediction of transmembrane helices: relationship to biological hydrophobicity. Zhao G, London E. Protein Sci. 2006 Aug;15(8):1987-2001.
+        https://www.ncbi.nlm.nih.gov/pubmed/16877712
+"""
 
 ISOELECTRIC_POINTS: Dict[str, float] = {
     "ALA": 6.11,

diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py
@@ -9,7 +9,7 @@
 
 import re
 from itertools import count
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import matplotlib
 import matplotlib.pyplot as plt
@@ -22,6 +22,7 @@
 from loguru import logger as log
 from mpl_toolkits.mplot3d import Axes3D
 
+from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES
 from graphein.protein.subgraphs import extract_k_hop_subgraph
 from graphein.utils.dependencies import import_message
 
@@ -47,6 +48,122 @@
     log.debug(message)
 
 
+"""
+TODO: Functino that gets ``min`` and ``max`` values in a graph for a given feature so that we can scale / offset to > 0
+TODO: should feature `distance` actually contain the site itself i.e. in the string?
+"""
+
+
+def _node_feature_func(
+    g: nx.Graph,
+    feature: str,
+    focal_node: Optional[str] = None,
+    focal_point: Optional[tuple] = None,
+    no_negatives: bool = False,
+) -> Callable:
+    """
+    Maps a feature as described by a string to a function that can be applied on nodes from a graph.
+
+    :param g: Protein graph.
+    :type g: nx.Graph
+    :param feature: Name of feature to extract.
+    :type feature: str
+    :param focal_node: A specific node within ``g`` to use in feature calculation; e.g. when calculating ``distance`` to a given site.
+    :type focal_node: Optional[str]
+    :param focal_point: Use specific coordinates instead of a node within the graph.
+    :type focal_point: tuple
+    :param no_negatives: Take the max of ``0`` and the feature's value. Defaults to ``False``.
+    :type no_negatives: bool
+    :return: Function that returns a value for a given node ID.
+    :rtype: Callable
+
+    TODO is there a way to wrap a lambda with another function i.e. max(0, f) for `no_negatives` ?
+    """
+    if feature == "degree":
+        return lambda k: g.degree[k]
+    elif feature in ["seq-position", "seq_position"]:
+        return lambda k: g.nodes(data=True)[k]["residue_number"]
+    elif feature == "rsa":
+        return lambda k: g.nodes(data=True)[k]["rsa"]
+    elif feature in ["bfac", "bfactor", "b_factor", "b-factor"]:
+        return lambda k: g.nodes(data=True)[k]["b_factor"]
+    elif (
+        feature == "distance"
+    ):  # Euclidean distance to a specific node / coordinate
+
+        def get_coords(g: nx.Graph, node: str) -> np.ndarray:
+            return np.array(g.nodes()[node]["coords"])
+
+        if focal_node:
+            assert focal_node in g.nodes()
+            return lambda k: np.linalg.norm(
+                get_coords(g, k) - get_coords(g, focal_node)
+            )
+        elif focal_point:
+            assert len(focal_point) == 3
+            return lambda k: np.linalg.norm(
+                get_coords(g, k) - np.array(focal_point)
+            )
+        else:
+            raise ValueError(
+                f"Node feature 'distance' requires one of `focal_node` or `focal_point`."
+            )
+
+    # Meiler embedding dimension
+    p = re.compile("meiler-?([0-9])")
+    match = p.search(feature)
+    if match:
+        dim = match.group(1)
+        if int(dim) in range(1, 8):
+            if no_negatives:
+                return lambda k: max(
+                    0, g.nodes(data=True)[k]["meiler"][f"dim_{dim}"]
+                )
+            else:
+                return lambda k: g.nodes(data=True)[k]["meiler"][f"dim_{dim}"]
+        else:
+            raise ValueError(
+                f"Meiler embeddings have dimensions 1-7, received {dim}."
+            )
+
+    # Hydrophobicity
+    p = re.compile(
+        "([a-z]{2})?-?(hydrophobicity)"
+    )  # e.g.  "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity"
+    match = p.search(feature)
+    if match and match.group(2):
+        # TODO: check if nodes actually have 'hydrophobicity' already; if they do, then use this.  if not, then map to kd.
+        scale: str = (
+            match.group(1) if match.group(1) else "kd"
+        )  # use 'kdhydrophobicity' as default if no scale specified
+        try:
+            hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale]
+        except:
+            raise KeyError(f"'{scale}' not a valid hydrophobicity scale.")
+        return lambda k: hydrophob[k.split(":")[1]]
+    else:
+        raise NotImplementedError(f"Feature '{feature}' not implemented.")
+
+
+def _node_size_func(
+    g: nx.Graph, feature: str, min: float, multiplier: float
+) -> Callable:
+    """
+    Returns a function that can be use to generate node sizes for plotting.
+
+    :param g: Protein graph
+    :type g: nx.Graph
+    :param feature: Name of feature to scale node sizes by.
+    :type feature: str
+    :param min: Number to offset size with.
+    :type min: float
+    :param multiplier: Number to scale feature values by.
+    :type multiplier: float
+    """
+    get_feature = _node_feature_func(g=g, feature=feature, no_negatives=True)
+    return lambda k: min + multiplier * get_feature(k)
+
+
 def plot_pointcloud(mesh: Meshes, title: str = "") -> Axes3D:
     """
     Plots pytorch3d Meshes object as pointcloud.
@@ -255,27 +372,12 @@ def plotly_protein_structure_graph(
         G, colour_map=edge_color_map, colour_by=colour_edges_by
     )
 
-    # Get node size
-    def node_scale_by(G: nx.Graph, feature: str):
-        if feature == "degree":
-            return lambda k: node_size_min + node_size_multiplier * G.degree[k]
-        elif feature == "rsa":
-            return (
-                lambda k: node_size_min
-                + node_size_multiplier * G.nodes(data=True)[k]["rsa"]
-            )
-
-        # Meiler embedding dimension
-        p = re.compile("meiler-([1-7])")
-        dim = p.search(feature)[1]
-        if dim:
-            return lambda k: node_size_min + node_size_multiplier * max(
-                0, G.nodes(data=True)[k]["meiler"][f"dim_{dim}"]
-            )  # Meiler values may be negative
-        else:
-            raise ValueError(f"Cannot size nodes by feature '{feature}'")
-
-    get_node_size = node_scale_by(G, node_size_feature)
+    size_by = _node_size_func(
+        G,
+        node_size_feature,
+        min=node_size_min,
+        multiplier=node_size_multiplier,
+    )
 
     # 3D network plot
     x_nodes = []
@@ -289,7 +391,7 @@ def node_scale_by(G: nx.Graph, feature: str):
         x_nodes.append(value[0])
         y_nodes.append(value[1])
         z_nodes.append(value[2])
-        node_sizes.append(get_node_size(key))
+        node_sizes.append(size_by(key))
 
         if label_node_ids:
             node_labels.append(list(G.nodes())[i])
@@ -736,6 +838,7 @@ def asteroid_plot(
     node_id: str,
     k: int = 2,
     colour_nodes_by: str = "shell",  # residue_name
+    size_nodes_by: str = "degree",
     colour_edges_by: str = "kind",
     edge_colour_map: plt.cm.Colormap = plt.cm.plasma,
     edge_alpha: float = 1.0,
@@ -746,6 +849,8 @@ def asteroid_plot(
     use_plotly: bool = True,
     show_edges: bool = False,
     show_legend: bool = True,
+    node_size_min: float = 20,
+    node_size_multiplier: float = 10,
     node_size_multiplier: float = 10.0,
 ) -> Union[plotly.graph_objects.Figure, matplotlib.figure.Figure]:
     # sourcery skip: remove-unnecessary-else, swap-if-else-branches
@@ -763,6 +868,8 @@ def asteroid_plot(
     :param colour_nodes_by: Colour the nodes by this attribute. Currently only
         ``"shell"`` is supported.
     :type colour_nodes_by: str
+    :param size_nodes_by: Size the nodes by an attribute.
+    :type size_nodes_by: str
     :param colour_edges_by: Colour the edges by this attribute. Currently only
         ``"kind"`` is supported.
     :type colour_edges_by: str
@@ -784,6 +891,8 @@ def asteroid_plot(
     :param show_legend: Whether to show the legend of the edges. Defaults to
         `True``.
     :type show_legend: bool
+    :param node_size_min: Specifies node minimum size. Defaults to ``20.0``.
+    :type node_size_min: float
     :param node_size_multiplier: Multiplier for the size of the nodes. Defaults
         to ``10.0``.
     :type node_size_multiplier: float
@@ -845,21 +954,34 @@ def asteroid_plot(
             node_x.append(x)
             node_y.append(y)
 
-        degrees = [
-            subgraph.degree(n) * node_size_multiplier for n in subgraph.nodes()
-        ]
+        size_by = _node_size_func(
+            subgraph,
+            size_nodes_by,
+            min=node_size_min,
+            multiplier=node_size_multiplier,
+        )
+        node_sizes = [size_by(n) for n in subgraph.nodes()]
 
+        colour_nodes_by = colour_nodes_by.lower()
+        node_colours = []
         if colour_nodes_by == "shell":
-            node_colours = []
             for n in subgraph.nodes():
                 for k, v in nodes.items():
                     if n in v:
                         node_colours.append(k)
         else:
-            raise NotImplementedError(
-                f"Colour by {colour_nodes_by} not implemented."
-            )
-            # TODO colour by AA type
+            try:
+                get_feature = _node_feature_func(
+                    g=subgraph, feature=colour_nodes_by, no_negatives=False
+                )
+            except:
+                raise NotImplementedError(
+                    f"Colour by {colour_nodes_by} not implemented."
+                )
+
+            for n, d in subgraph.nodes(data=True):
+                node_colours.append(get_feature(n))
+
         node_trace = go.Scatter(
             x=node_x,
             y=node_y,
@@ -869,13 +991,13 @@ def asteroid_plot(
             textposition="bottom center",
             showlegend=False,
             marker=dict(
-                colorscale="YlGnBu",
+                colorscale="viridis",
                 reversescale=True,
                 color=node_colours,
-                size=degrees,
+                size=node_sizes,
                 colorbar=dict(
                     thickness=15,
-                    title="Shell",
+                    title=str.capitalize(colour_nodes_by),
                     tickvals=list(range(k)),
                     xanchor="left",
                     titleside="right",

diff --git a/graphein/utils/utils.py b/graphein/utils/utils.py
@@ -357,6 +357,48 @@ def ping(host: str) -> bool:
     return subprocess.call(command) == 0
 
 
+def get_node_attribute_names(g: nx.Graph) -> List[str]:
+    """Returns a list of node attribute names present within a graph.
+
+    :param g: Networkx Graph.
+    :type g: nx.Graph
+    :returns: List of node attribute names
+    :rtype: List[str]
+    """
+
+    return list(
+        set(np.array([list(g.nodes[n].keys()) for n in g.nodes()]).flatten())
+    )
+
+
+def get_edge_attribute_names(g: nx.Graph) -> List[str]:
+    """Returns a list of edge attribute names present within a graph.
+
+    :param g: Networkx Graph.
+    :type g: nx.Graph
+    :returns: List of edge attribute names
+    :rtype: List[str]
+    """
+    return list(
+        set(
+            np.array(
+                [list(g.edges[u, v].keys()) for u, v in g.edges()]
+            ).flatten()
+        )
+    )
+
+
+def get_graph_attribute_names(g: nx.Graph) -> List[str]:
+    """Returns a list of graph attribute names present within a graph.
+
+    :param g: Networkx Graph.
+    :type g: nx.Graph
+    :returns: List of graph attribute names
+    :rtype: List[str]
+    """
+    return list(g.graph.keys())
+
+
 def parse_aggregation_type(
     aggregation_type: Literal["sum", "mean", "max", "min", "median"]
 ) -> Callable:

diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
@@ -0,0 +1,57 @@
+"""Tests for graphein.utils.utils"""
+
+from graphein.protein.graphs import construct_graph
+from graphein.utils.utils import (
+    get_edge_attribute_names,
+    get_graph_attribute_names,
+    get_node_attribute_names,
+)
+
+
+def test_get_graph_attribute_names():
+    g = construct_graph(pdb_code="3eiy")
+    DEFAULT_ATTRS = [
+        "name",
+        "pdb_code",
+        "pdb_path",
+        "chain_ids",
+        "pdb_df",
+        "raw_pdb_df",
+        "rgroup_df",
+        "coords",
+        "node_type",
+        "sequence_A",
+        "config",
+        "dist_mat",
+    ]
+    graph_attrs = get_graph_attribute_names(g)
+    assert set(graph_attrs) == set(
+        DEFAULT_ATTRS
+    ), "Graph attributes do not match expected attributes."
+
+
+def test_get_node_attribute_names():
+    g = construct_graph(pdb_code="3eiy")
+    DEFAULT_ATTRS = [
+        "chain_id",
+        "residue_name",
+        "residue_number",
+        "atom_type",
+        "element_symbol",
+        "coords",
+        "b_factor",
+        "meiler",
+    ]
+    node_attrs = get_node_attribute_names(g)
+    assert set(node_attrs) == set(
+        DEFAULT_ATTRS
+    ), "Node attributes do not match expected attributes."
+
+
+def test_get_edge_attribute_names():
+    g = construct_graph(pdb_code="3eiy")
+    DEFAULT_ATTRS = ["kind", "distance"]
+    edge_attrs = get_edge_attribute_names(g)
+    assert set(edge_attrs) == set(
+        DEFAULT_ATTRS
+    ), "Edge attributes do not match expected attributes."