Skip to content
Open
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
622d7b8
Fix param name typo in function docstring
kamurani Jul 2, 2022
8a3f3a4
add scaling node size by "rsa" feature as well as degree
kamurani Jul 2, 2022
9c9520b
add option for scaling node size by meiler embedding dimensions. Tak…
kamurani Jul 2, 2022
44a0bf8
remove walrus operator := for compatability
kamurani Jul 6, 2022
920e14e
Merge pull request #1 from a-r-j/master
kamurani Jul 7, 2022
efc69bb
Merge pull request #2 from a-r-j/master
kamurani Jul 8, 2022
a2806b6
Add type hints
a-r-j Jul 8, 2022
024e7a0
Update changelog
a-r-j Jul 8, 2022
7569751
add support for sizing nodes by RSA and colouring by hydrophobicity i…
kamurani Jul 22, 2022
66baf18
Merge remote-tracking branch 'origin/master' into graph_plots
kamurani Jul 22, 2022
3513e0c
Merge branch 'master' into graph_plots
a-r-j Jul 22, 2022
21d496a
add amino acid 3-letter code mapping to hydrophobicity scales from th…
kamurani Jul 29, 2022
2f4f0ee
add amino acid 3-letter code mapping to hydrophobicity scales from th…
kamurani Jul 29, 2022
fbac7ea
colour by hydrophobicity implemented for different scales
kamurani Jul 29, 2022
5878e4f
refactor `_node_feature` function; colour_by and size_by msupported w…
kamurani Jul 29, 2022
8948b1f
fix import statement to use graphein actual
kamurani Jul 29, 2022
1212d5e
add `hydrophobicity()`. not sure if should be passed a parameter dec…
kamurani Jul 29, 2022
6b29498
"Add a utility for getting the names of node, edge and graph attribut…
a-r-j Aug 1, 2022
00f99a5
fix edge attribute selection in util
a-r-j Aug 1, 2022
28b0ee1
add test for attribute name selection util
a-r-j Aug 1, 2022
8f22fed
use typing_extensions Literal for 3.7 support and update changelog
a-r-j Aug 2, 2022
2b69d6e
docstring, black
a-r-j Oct 23, 2022
9743332
black
a-r-j Oct 23, 2022
43bc240
fix type import; black
a-r-j Oct 23, 2022
c5e6d83
Merge branch 'master' into graph_plots
a-r-j Oct 31, 2022
f43f5e7
Merge branch 'master' into graph_plots
a-r-j Feb 16, 2023
fd1f996
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 16, 2023
37aaee8
Merge branch 'master' into graph_plots
a-r-j Mar 25, 2024
34c816b
Merge branch 'master' into graph_plots
a-r-j Oct 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -206,6 +206,8 @@
* [Bugfix] - [#208](https://github.com/a-r-j/graphein/pull/208) explicitly installs `jupyter_contrib_nbextensions` in Docker.

### 1.5.1
* [Feature] - [#197](https://github.com/a-r-j/graphein/pull/197/) adds support for sizing and colouring nodes in asteroid plots
* [Feature] - [#197](https://github.com/a-r-j/graphein/pull/197/) adds utilities for retrieving a list of graph/node/edge attribute names in `graphein.utils.utils`.

#### Protein

@@ -215,6 +217,7 @@
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) adds a `residue_id` column to PDB dfs to enable easier accounting in atom graphs.
* [Feature] - [#189](https://github.com/a-r-j/graphein/pull/189/) refactors torch geometric datasets to use parallelised download for faster dataset preparation.


#### Bugfixes

* [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.
42 changes: 42 additions & 0 deletions graphein/protein/features/nodes/amino_acid.py
Original file line number Diff line number Diff line change
@@ -18,6 +18,8 @@
BASE_AMINO_ACIDS,
HYDROGEN_BOND_ACCEPTORS,
HYDROGEN_BOND_DONORS,
HYDROPHOBICITY_SCALES,
HYDROPHOBICITY_TYPE,
RESI_THREE_TO_1,
)
from graphein.utils.utils import onek_encoding_unk
@@ -277,3 +279,43 @@ def hydrogen_bond_acceptor(
if not sum_features:
features = np.array(features > 0).astype(int)
d["hbond_acceptors"] = features


def hydrophobicity(
n: str,
d: Dict[str, Any],
mapping: HYDROPHOBICITY_TYPE = "kd",
return_array: bool = True,
) -> Union[np.ndarray, pd.Series]:
"""
Adds hydrophobicity values for each residue to graph nodes.
See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for
values and available scales.
:param n: Node ID. Unused - kept to maintain consistent function signature.
:type n: str
:param d: Dictionary of node attributes.
:type d: Dict[str, Any]
:param mapping: Which hydrophobicity scale to use. See
:const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_TYPE` for supported types.
:type mapping: graphien.protein.resi_atoms.HYDROPHOBICITY_TYPE
:param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns
a ``pd.Series``. Default is ``True``.
:type return_array: bool
"""
assert (
mapping in HYDROPHOBICITY_SCALES.keys()
), f"Unsupported mapping: {mapping}. Supported mappings: {HYDROPHOBICITY_SCALES.keys()}"
hydr = HYDROPHOBICITY_SCALES[mapping]

amino_acid = d["residue_name"]
try:
features = hydr[amino_acid]
except:
features = pd.Series(np.zeros(1))

if return_array:
features = np.array(features)

d[f"hydrophobicity_{mapping}"] = features
return features
146 changes: 146 additions & 0 deletions graphein/protein/resi_atoms.py
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@

import numpy as np
from sklearn.preprocessing import StandardScaler
from typing_extensions import Literal

BACKBONE_ATOMS: List[str] = ["N", "CA", "C", "O"]
"""Atoms present in Amino Acid Backbones."""
@@ -1103,6 +1104,151 @@
https://pubs.acs.org/doi/10.1021/j100785a001
"""

HYDROPHOBICITY_TYPE = Literal["kd", "ww", "hh", "mf", "tt"]
"""Supported hydrophobicity types. See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for further details."""

HYDROPHOBICITY_SCALES: Dict[str, Dict[str, float]] = {
"kd": { # kdHydrophobicity (a)
"ILE": 4.5,
"VAL": 4.2,
"LEU": 3.8,
"PHE": 2.8,
"CYS": 2.5,
"MET": 1.9,
"ALA": 1.8,
"GLY": -0.4,
"THR": -0.7,
"SER": -0.8,
"TRP": -0.9,
"TYR": -1.3,
"PRO": -1.6,
"HIS": -3.2,
"GLU": -3.5,
"GLN": -3.5,
"ASP": -3.5,
"ASN": -3.5,
"LYS": -3.9,
"ARG": -4.5,
},
"ww": { # wwHydrophobicity (b)
"ILE": 0.31,
"VAL": -0.07,
"LEU": 0.56,
"PHE": 1.13,
"CYS": 0.24,
"MET": 0.23,
"ALA": -0.17,
"GLY": -0.01,
"THR": -0.14,
"SER": -0.13,
"TRP": 1.85,
"TYR": 0.94,
"PRO": -0.45,
"HIS": -0.96,
"GLU": -2.02,
"GLN": -0.58,
"ASP": -1.23,
"ASN": -0.42,
"LYS": -0.99,
"ARG": -0.81,
},
"hh": { # hhHydrophobicity (c)
"ILE": -0.60,
"VAL": -0.31,
"LEU": -0.55,
"PHE": -0.32,
"CYS": -0.13,
"MET": -0.10,
"ALA": 0.11,
"GLY": 0.74,
"THR": 0.52,
"SER": 0.84,
"TRP": 0.30,
"TYR": 0.68,
"PRO": 2.23,
"HIS": 2.06,
"GLU": 2.68,
"GLN": 2.36,
"ASP": 3.49,
"ASN": 2.05,
"LYS": 2.71,
"ARG": 2.58,
},
"mf": { # mfHydrophobicity (d)
"ILE": -1.56,
"VAL": -0.78,
"LEU": -1.81,
"PHE": -2.20,
"CYS": 0.49,
"MET": -0.76,
"ALA": 0.0,
"GLY": 1.72,
"THR": 1.78,
"SER": 1.83,
"TRP": -0.38,
"TYR": -1.09,
"PRO": -1.52,
"HIS": 4.76,
"GLU": 1.64,
"GLN": 3.01,
"ASP": 2.95,
"ASN": 3.47,
"LYS": 5.39,
"ARG": 3.71,
},
"tt": { # ttHydrophobicity (e)
"ILE": 1.97,
"VAL": 1.46,
"LEU": 1.82,
"PHE": 1.98,
"CYS": -0.30,
"MET": 1.40,
"ALA": 0.38,
"GLY": -0.19,
"THR": -0.32,
"SER": -0.53,
"TRP": 1.53,
"TYR": 0.49,
"PRO": -1.44,
"HIS": -1.44,
"GLU": -2.90,
"GLN": -1.84,
"ASP": -3.27,
"ASN": -1.62,
"LYS": -3.46,
"ARG": -2.57,
},
}
"""
Set of (5) dictionaries that map amino acid 3-letter codes to their hydrophobicity.
The scales included are from Chimera (UCSF) https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html
and are as follows:
* kdHydrophobicity
(a) A simple method for displaying the hydropathic character of a protein. Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32.
https://www.ncbi.nlm.nih.gov/pubmed/7108955
* wwHydrophobicity
(b) Experimentally determined hydrophobicity scale for proteins at membrane interfaces. Wimley WC, White SH. Nat Struct Biol. 1996 Oct;3(10):842-8.
https://www.ncbi.nlm.nih.gov/pubmed/8836100
* hhHydrophobicity
(c) Recognition of transmembrane helices by the endoplasmic reticulum translocon. Hessa T, Kim H, Bihlmaier K, Lundin C, Boekel J, Andersson H, Nilsson I, White SH, von Heijne G. Nature. 2005 Jan 27;433(7024):377-81, supplementary data.
https://www.ncbi.nlm.nih.gov/pubmed/15674282
In this scale, negative values indicate greater hydrophobicity.
* mfHydrophobicity
(d) Side-chain hydrophobicity scale derived from transmembrane protein folding into lipid bilayers. Moon CP, Fleming KG. Proc Natl Acad Sci USA. 2011 Jun 21;108(25):10174-7, supplementary data.
https://www.ncbi.nlm.nih.gov/pubmed/21606332
In this scale, negative values indicate greater hydrophobicity.
* ttHydrophobicity
(e) An amino acid “transmembrane tendency” scale that approaches the theoretical limit to accuracy for prediction of transmembrane helices: relationship to biological hydrophobicity. Zhao G, London E. Protein Sci. 2006 Aug;15(8):1987-2001.
https://www.ncbi.nlm.nih.gov/pubmed/16877712
"""

ISOELECTRIC_POINTS: Dict[str, float] = {
"ALA": 6.11,
190 changes: 156 additions & 34 deletions graphein/protein/visualisation.py
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@

import re
from itertools import count
from typing import Dict, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, Tuple, Union

import matplotlib
import matplotlib.pyplot as plt
@@ -22,6 +22,7 @@
from loguru import logger as log
from mpl_toolkits.mplot3d import Axes3D

from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES
from graphein.protein.subgraphs import extract_k_hop_subgraph
from graphein.utils.dependencies import import_message

@@ -47,6 +48,122 @@
log.debug(message)


"""
TODO: Functino that gets ``min`` and ``max`` values in a graph for a given feature so that we can scale / offset to > 0
TODO: should feature `distance` actually contain the site itself i.e. in the string?
"""


def _node_feature_func(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really elegant work - thanks a lot!

Having taken a closer look at the code I wonder if we can be even smarter here. Given the rest of Graphein is quite functional in style, we could perhaps refactor out the sizing functions into atomic functions & then allow users to pass in either a function of their own or a string which would dispatch one the lambdas/more complicated functions you've implemented here. I think this provides the most flexibility whilst supporting all the great features you've added here. What do you think?

g: nx.Graph,
feature: str,
focal_node: Optional[str] = None,
focal_point: Optional[tuple] = None,
no_negatives: bool = False,
) -> Callable:
"""
Maps a feature as described by a string to a function that can be applied on nodes from a graph.
:param g: Protein graph.
:type g: nx.Graph
:param feature: Name of feature to extract.
:type feature: str
:param focal_node: A specific node within ``g`` to use in feature calculation; e.g. when calculating ``distance`` to a given site.
:type focal_node: Optional[str]
:param focal_point: Use specific coordinates instead of a node within the graph.
:type focal_point: tuple
:param no_negatives: Take the max of ``0`` and the feature's value. Defaults to ``False``.
:type no_negatives: bool
:return: Function that returns a value for a given node ID.
:rtype: Callable
TODO is there a way to wrap a lambda with another function i.e. max(0, f) for `no_negatives` ?
"""
if feature == "degree":
return lambda k: g.degree[k]
elif feature in ["seq-position", "seq_position"]:
return lambda k: g.nodes(data=True)[k]["residue_number"]
elif feature == "rsa":
return lambda k: g.nodes(data=True)[k]["rsa"]
elif feature in ["bfac", "bfactor", "b_factor", "b-factor"]:
return lambda k: g.nodes(data=True)[k]["b_factor"]
elif (
feature == "distance"
): # Euclidean distance to a specific node / coordinate

def get_coords(g: nx.Graph, node: str) -> np.ndarray:
return np.array(g.nodes()[node]["coords"])

if focal_node:
assert focal_node in g.nodes()
return lambda k: np.linalg.norm(
get_coords(g, k) - get_coords(g, focal_node)
)
elif focal_point:
assert len(focal_point) == 3
return lambda k: np.linalg.norm(
get_coords(g, k) - np.array(focal_point)
)
else:
raise ValueError(
f"Node feature 'distance' requires one of `focal_node` or `focal_point`."
)

# Meiler embedding dimension
p = re.compile("meiler-?([0-9])")
match = p.search(feature)
if match:
dim = match.group(1)
if int(dim) in range(1, 8):
if no_negatives:
return lambda k: max(
0, g.nodes(data=True)[k]["meiler"][f"dim_{dim}"]
)
else:
return lambda k: g.nodes(data=True)[k]["meiler"][f"dim_{dim}"]
else:
raise ValueError(
f"Meiler embeddings have dimensions 1-7, received {dim}."
)

# Hydrophobicity
p = re.compile(
"([a-z]{2})?-?(hydrophobicity)"
) # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity"
match = p.search(feature)
if match and match.group(2):
# TODO: check if nodes actually have 'hydrophobicity' already; if they do, then use this. if not, then map to kd.
scale: str = (
match.group(1) if match.group(1) else "kd"
) # use 'kdhydrophobicity' as default if no scale specified
try:
hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale]
except:
raise KeyError(f"'{scale}' not a valid hydrophobicity scale.")
return lambda k: hydrophob[k.split(":")[1]]
else:
raise NotImplementedError(f"Feature '{feature}' not implemented.")


def _node_size_func(
g: nx.Graph, feature: str, min: float, multiplier: float
) -> Callable:
"""
Returns a function that can be use to generate node sizes for plotting.
:param g: Protein graph
:type g: nx.Graph
:param feature: Name of feature to scale node sizes by.
:type feature: str
:param min: Number to offset size with.
:type min: float
:param multiplier: Number to scale feature values by.
:type multiplier: float
"""
get_feature = _node_feature_func(g=g, feature=feature, no_negatives=True)
return lambda k: min + multiplier * get_feature(k)


def plot_pointcloud(mesh: Meshes, title: str = "") -> Axes3D:
"""
Plots pytorch3d Meshes object as pointcloud.
@@ -255,27 +372,12 @@ def plotly_protein_structure_graph(
G, colour_map=edge_color_map, colour_by=colour_edges_by
)

# Get node size
def node_scale_by(G: nx.Graph, feature: str):
if feature == "degree":
return lambda k: node_size_min + node_size_multiplier * G.degree[k]
elif feature == "rsa":
return (
lambda k: node_size_min
+ node_size_multiplier * G.nodes(data=True)[k]["rsa"]
)

# Meiler embedding dimension
p = re.compile("meiler-([1-7])")
dim = p.search(feature)[1]
if dim:
return lambda k: node_size_min + node_size_multiplier * max(
0, G.nodes(data=True)[k]["meiler"][f"dim_{dim}"]
) # Meiler values may be negative
else:
raise ValueError(f"Cannot size nodes by feature '{feature}'")

get_node_size = node_scale_by(G, node_size_feature)
size_by = _node_size_func(
G,
node_size_feature,
min=node_size_min,
multiplier=node_size_multiplier,
)

# 3D network plot
x_nodes = []
@@ -289,7 +391,7 @@ def node_scale_by(G: nx.Graph, feature: str):
x_nodes.append(value[0])
y_nodes.append(value[1])
z_nodes.append(value[2])
node_sizes.append(get_node_size(key))
node_sizes.append(size_by(key))

if label_node_ids:
node_labels.append(list(G.nodes())[i])
@@ -736,6 +838,7 @@ def asteroid_plot(
node_id: str,
k: int = 2,
colour_nodes_by: str = "shell", # residue_name
size_nodes_by: str = "degree",
colour_edges_by: str = "kind",
edge_colour_map: plt.cm.Colormap = plt.cm.plasma,
edge_alpha: float = 1.0,
@@ -746,6 +849,8 @@ def asteroid_plot(
use_plotly: bool = True,
show_edges: bool = False,
show_legend: bool = True,
node_size_min: float = 20,
node_size_multiplier: float = 10,
node_size_multiplier: float = 10.0,
) -> Union[plotly.graph_objects.Figure, matplotlib.figure.Figure]:
# sourcery skip: remove-unnecessary-else, swap-if-else-branches
@@ -763,6 +868,8 @@ def asteroid_plot(
:param colour_nodes_by: Colour the nodes by this attribute. Currently only
``"shell"`` is supported.
:type colour_nodes_by: str
:param size_nodes_by: Size the nodes by an attribute.
:type size_nodes_by: str
:param colour_edges_by: Colour the edges by this attribute. Currently only
``"kind"`` is supported.
:type colour_edges_by: str
@@ -784,6 +891,8 @@ def asteroid_plot(
:param show_legend: Whether to show the legend of the edges. Defaults to
`True``.
:type show_legend: bool
:param node_size_min: Specifies node minimum size. Defaults to ``20.0``.
:type node_size_min: float
:param node_size_multiplier: Multiplier for the size of the nodes. Defaults
to ``10.0``.
:type node_size_multiplier: float
@@ -845,21 +954,34 @@ def asteroid_plot(
node_x.append(x)
node_y.append(y)

degrees = [
subgraph.degree(n) * node_size_multiplier for n in subgraph.nodes()
]
size_by = _node_size_func(
subgraph,
size_nodes_by,
min=node_size_min,
multiplier=node_size_multiplier,
)
node_sizes = [size_by(n) for n in subgraph.nodes()]

colour_nodes_by = colour_nodes_by.lower()
node_colours = []
if colour_nodes_by == "shell":
node_colours = []
for n in subgraph.nodes():
for k, v in nodes.items():
if n in v:
node_colours.append(k)
else:
raise NotImplementedError(
f"Colour by {colour_nodes_by} not implemented."
)
# TODO colour by AA type
try:
get_feature = _node_feature_func(
g=subgraph, feature=colour_nodes_by, no_negatives=False
)
except:
raise NotImplementedError(
f"Colour by {colour_nodes_by} not implemented."
)

for n, d in subgraph.nodes(data=True):
node_colours.append(get_feature(n))

node_trace = go.Scatter(
x=node_x,
y=node_y,
@@ -869,13 +991,13 @@ def asteroid_plot(
textposition="bottom center",
showlegend=False,
marker=dict(
colorscale="YlGnBu",
colorscale="viridis",
reversescale=True,
color=node_colours,
size=degrees,
size=node_sizes,
colorbar=dict(
thickness=15,
title="Shell",
title=str.capitalize(colour_nodes_by),
tickvals=list(range(k)),
xanchor="left",
titleside="right",
42 changes: 42 additions & 0 deletions graphein/utils/utils.py
Original file line number Diff line number Diff line change
@@ -357,6 +357,48 @@ def ping(host: str) -> bool:
return subprocess.call(command) == 0


def get_node_attribute_names(g: nx.Graph) -> List[str]:
"""Returns a list of node attribute names present within a graph.
:param g: Networkx Graph.
:type g: nx.Graph
:returns: List of node attribute names
:rtype: List[str]
"""

return list(
set(np.array([list(g.nodes[n].keys()) for n in g.nodes()]).flatten())
)


def get_edge_attribute_names(g: nx.Graph) -> List[str]:
"""Returns a list of edge attribute names present within a graph.
:param g: Networkx Graph.
:type g: nx.Graph
:returns: List of edge attribute names
:rtype: List[str]
"""
return list(
set(
np.array(
[list(g.edges[u, v].keys()) for u, v in g.edges()]
).flatten()
)
)


def get_graph_attribute_names(g: nx.Graph) -> List[str]:
"""Returns a list of graph attribute names present within a graph.
:param g: Networkx Graph.
:type g: nx.Graph
:returns: List of graph attribute names
:rtype: List[str]
"""
return list(g.graph.keys())


def parse_aggregation_type(
aggregation_type: Literal["sum", "mean", "max", "min", "median"]
) -> Callable:
57 changes: 57 additions & 0 deletions tests/utils/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Tests for graphein.utils.utils"""

from graphein.protein.graphs import construct_graph
from graphein.utils.utils import (
get_edge_attribute_names,
get_graph_attribute_names,
get_node_attribute_names,
)


def test_get_graph_attribute_names():
g = construct_graph(pdb_code="3eiy")
DEFAULT_ATTRS = [
"name",
"pdb_code",
"pdb_path",
"chain_ids",
"pdb_df",
"raw_pdb_df",
"rgroup_df",
"coords",
"node_type",
"sequence_A",
"config",
"dist_mat",
]
graph_attrs = get_graph_attribute_names(g)
assert set(graph_attrs) == set(
DEFAULT_ATTRS
), "Graph attributes do not match expected attributes."


def test_get_node_attribute_names():
g = construct_graph(pdb_code="3eiy")
DEFAULT_ATTRS = [
"chain_id",
"residue_name",
"residue_number",
"atom_type",
"element_symbol",
"coords",
"b_factor",
"meiler",
]
node_attrs = get_node_attribute_names(g)
assert set(node_attrs) == set(
DEFAULT_ATTRS
), "Node attributes do not match expected attributes."


def test_get_edge_attribute_names():
g = construct_graph(pdb_code="3eiy")
DEFAULT_ATTRS = ["kind", "distance"]
edge_attrs = get_edge_attribute_names(g)
assert set(edge_attrs) == set(
DEFAULT_ATTRS
), "Edge attributes do not match expected attributes."