Skip to content

Commit

Permalink
feat(ml): graph learning algorithm impl (10+) (apache#102)
Browse files Browse the repository at this point in the history
* glcc-hugegraph-graph-ai-B

* change readme.md

* Update README.md

* Update test_examples.py

---------

Co-authored-by: Simon Cheung <[email protected]>
  • Loading branch information
MrJs133 and simon824 authored Nov 12, 2024
1 parent 1a72aa4 commit 0f609ba
Show file tree
Hide file tree
Showing 36 changed files with 6,413 additions and 18 deletions.
56 changes: 41 additions & 15 deletions hugegraph-ml/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# hugegraph-ml
# hugegraph-ml

## Summary

Expand All @@ -7,6 +7,26 @@ It implements most graph learning algorithms, enabling users to perform end-to-e
Graph data can be read directly from `HugeGraph` and used for tasks such as node embedding, node classification, and graph classification.
The implemented algorithm models can be found in the [models](./src/hugegraph_ml/models) folder.

| model | paper |
| ----------- | -------------------------------------------------- |
| AGNN | https://arxiv.org/abs/1803.03735 |
| APPNP | https://arxiv.org/abs/1810.05997 |
| ARMA | https://arxiv.org/abs/1901.01343 |
| BGNN | https://arxiv.org/abs/2101.08543 |
| BGRL | https://arxiv.org/abs/2102.06514 |
| CARE-GNN | https://arxiv.org/abs/2008.08692 |
| Cluster-GCN | https://arxiv.org/abs/1905.07953 |
| C&S | https://arxiv.org/abs/2010.13993 |
| DAGNN | https://arxiv.org/abs/2007.09296 |
| DeeperGCN | https://arxiv.org/abs/2006.07739 |
| DGI | https://arxiv.org/abs/1809.10341 |
| DiffPool | https://arxiv.org/abs/1806.08804 |
| GATNE | https://arxiv.org/abs/1905.01669 |
| GRACE | https://arxiv.org/abs/2006.04131 |
| GRAND | https://arxiv.org/abs/2005.11079 |
| JKNet | https://arxiv.org/abs/1806.03536 |
| P-GNN | http://proceedings.mlr.press/v97/you19b/you19b.pdf |
| SEAL | https://arxiv.org/abs/1802.09691 |

## Environment Requirements

Expand All @@ -16,22 +36,28 @@ The implemented algorithm models can be found in the [models](./src/hugegraph_ml
## Preparation

1. Start the HugeGraph database. You can do this via Docker or the [binary packages](https://hugegraph.apache.org/docs/download/download/).
Refer to [docker-link](https://hub.docker.com/r/hugegraph/hugegraph) & [deploy-doc](https://hugegraph.apache.org/docs/quickstart/hugegraph-server/#31-use-docker-container-convenient-for-testdev) for guidance
Refer to the [docker-link](https://hub.docker.com/r/hugegraph/hugegraph) and the [deploy-doc](https://hugegraph.apache.org/docs/quickstart/hugegraph-server/#31-use-docker-container-convenient-for-testdev) for guidance.

2. Clone this project
```bash
git clone https://github.com/apache/incubator-hugegraph-ai.git
```
3. Install [hugegraph-python-client](../hugegraph-python-client) and [hugegraph_ml](../hugegraph-ml)
```bash
cd ./incubator-hugegraph-ai # better to use virtualenv (source venv/bin/activate)
pip install ./hugegraph-python-client
cd ./hugegraph-ml/
pip install -e .
```

```bash
git clone https://github.com/apache/incubator-hugegraph-ai.git
```

3. Install [hugegraph-python-client](../hugegraph-python-client) and [hugegraph-ml](../hugegraph-ml)

```bash
cd ./incubator-hugegraph-ai # better to use virtualenv (source venv/bin/activate)
pip install ./hugegraph-python-client
cd ./hugegraph-ml/
pip install -e .
```

4. Enter the project directory
```bash
cd ./hugegraph-ml/src
```

```bash
cd ./hugegraph-ml/src
```

## Examples

Expand Down
295 changes: 294 additions & 1 deletion hugegraph-ml/src/hugegraph_ml/data/hugegraph2dgl.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from pyhugegraph.client import PyHugeClient

from hugegraph_ml.data.hugegraph_dataset import HugeGraphDataset

import networkx as nx

class HugeGraph2DGL:
def __init__(
Expand Down Expand Up @@ -150,6 +150,132 @@ def convert_graph_dataset(
dataset_dgl = HugeGraphDataset(graphs=graphs, labels=graph_labels, info=graphs_info)
return dataset_dgl

def convert_graph_nx(
    self,
    vertex_label: str,
    edge_label: str,
):
    """Query one vertex label and one edge label and convert them to NetworkX.

    Args:
        vertex_label: Label used in the ``g.V().hasLabel(...)`` Gremlin query.
        edge_label: Label used in the ``g.E().hasLabel(...)`` Gremlin query.

    Returns:
        Whatever ``_convert_graph_from_v_e_nx`` builds from the ``"data"``
        payloads of the two query results.
    """
    vertex_rows = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")["data"]
    edge_rows = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"]
    return self._convert_graph_from_v_e_nx(vertices=vertex_rows, edges=edge_rows)

def convert_graph_with_edge_feat(
    self,
    vertex_label: str,
    edge_label: str,
    node_feat_key: str = "feat",
    edge_feat_key: str = "edge_feat",
    label_key: str = "label",
    mask_keys: Optional[List[str]] = None,
):
    """Query one vertex/edge label pair and convert it to a DGL graph with edge features.

    Args:
        vertex_label: Label used in the ``g.V().hasLabel(...)`` Gremlin query.
        edge_label: Label used in the ``g.E().hasLabel(...)`` Gremlin query.
        node_feat_key: Vertex property name holding node features.
        edge_feat_key: Edge property name holding edge features.
        label_key: Vertex property name holding node labels.
        mask_keys: Vertex property names holding split masks; ``None`` means
            ``["train_mask", "val_mask", "test_mask"]``.

    Returns:
        The graph built by ``_convert_graph_from_v_e_with_edge_feat``.
    """
    masks = ["train_mask", "val_mask", "test_mask"] if mask_keys is None else mask_keys
    vertex_rows = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")["data"]
    edge_rows = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"]
    # NOTE: the helper takes the edge feature key *before* the node feature key.
    return self._convert_graph_from_v_e_with_edge_feat(
        vertex_rows,
        edge_rows,
        edge_feat_key,
        node_feat_key,
        label_key,
        masks,
    )

def convert_graph_ogb(self, vertex_label: str, edge_label: str, split_label: str):
    """Query an OGB-style graph plus its split edges and convert both.

    Args:
        vertex_label: Label used in the ``g.V().hasLabel(...)`` Gremlin query.
        edge_label: Label of the graph's own edges.
        split_label: Label of the edges that carry the train/valid/test
            split mask properties.

    Returns:
        A ``(graph_dgl, split_edge)`` pair: the graph from
        ``_convert_graph_from_ogb`` (built with the fixed property names
        ``"feat"``, ``"year"`` and ``"weight"``) and the dictionary from
        ``_convert_split_edge_from_ogb``.
    """
    vertex_rows = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")["data"]
    edge_rows = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"]
    converted = self._convert_graph_from_ogb(
        vertex_rows, edge_rows, "feat", "year", "weight"
    )
    graph_dgl, vertex_id_to_idx = converted
    # Split edges are indexed with the same vertex-id mapping as the graph itself.
    split_rows = self._graph_germlin.exec(f"g.E().hasLabel('{split_label}')")["data"]
    return graph_dgl, self._convert_split_edge_from_ogb(split_rows, vertex_id_to_idx)

def convert_hetero_graph_bgnn(
    self,
    vertex_labels: List[str],
    edge_labels: List[str],
    feat_key: str = "feat",
    label_key: str = "class",
    cat_key: str = "cat_features",
    mask_keys: Optional[List[str]] = None,
):
    """Build a DGL heterograph from several vertex and edge labels.

    Vertices of every label are queried and re-indexed from 0 in query
    order. Properties found on the first vertex of a label are converted to
    tensors and attached as node data.

    Args:
        vertex_labels: Vertex labels to query.
        edge_labels: Edge labels to query. The source/destination vertex
            labels of each edge label are taken from its first edge.
        feat_key: Vertex property stored as node data ``"feat"`` (int32).
        label_key: Vertex property stored as node data ``"class"`` (float64).
        cat_key: Vertex property stored as node data ``"cat_features"`` (int32).
        mask_keys: Vertex properties stored as bool node data under their
            own names; ``None`` means
            ``["train_mask", "val_mask", "test_mask"]``.

    Returns:
        The heterograph created by ``dgl.heterograph`` with node data attached.

    Raises:
        KeyError: If an edge refers to a vertex label or vertex id that was
            not loaded through ``vertex_labels``.
    """
    if mask_keys is None:
        mask_keys = ["train_mask", "val_mask", "test_mask"]

    # (vertex property key, node-data name, tensor dtype), in assignment order.
    tensor_specs = [
        (feat_key, "feat", torch.int32),
        (label_key, "class", torch.float64),
        (cat_key, "cat_features", torch.int32),
    ]
    tensor_specs.extend((mk, mk, torch.bool) for mk in mask_keys)

    vertex_label_id2idx = {}
    vertex_label_data = {}
    for vertex_label in vertex_labels:
        vertices = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")[
            "data"
        ]
        if not vertices:
            warnings.warn(
                f"Graph has no vertices of vertex_label: {vertex_label}", Warning
            )
            continue
        vertex_label_id2idx[vertex_label] = {
            v["id"]: idx for idx, v in enumerate(vertices)
        }
        # Only the first vertex is inspected to decide which properties exist.
        first_props = vertices[0]["properties"]
        label_data = {}
        for key, name, dtype in tensor_specs:
            if key in first_props:
                label_data[name] = torch.tensor(
                    [v["properties"][key] for v in vertices], dtype=dtype
                )
        vertex_label_data[vertex_label] = label_data

    # Build the heterograph from edges.
    edge_data_dict = {}
    for edge_label in edge_labels:
        edges = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"]
        if not edges:
            warnings.warn(
                f"Graph has no edges of edge_label: {edge_label}", Warning
            )
            continue
        src_vertex_label = edges[0]["outVLabel"]
        dst_vertex_label = edges[0]["inVLabel"]
        src_id2idx = vertex_label_id2idx[src_vertex_label]
        dst_id2idx = vertex_label_id2idx[dst_vertex_label]
        edge_data_dict[(src_vertex_label, edge_label, dst_vertex_label)] = (
            [src_id2idx[e["outV"]] for e in edges],
            [dst_id2idx[e["inV"]] for e in edges],
        )

    hetero_graph = dgl.heterograph(edge_data_dict)
    # Attach node data only for labels that actually produced vertices; a
    # label that merely triggered the "no vertices" warning has no entry in
    # vertex_label_data and must not raise KeyError here.
    for vertex_label, label_data in vertex_label_data.items():
        for prop, values in label_data.items():
            hetero_graph.nodes[vertex_label].data[prop] = values

    return hetero_graph

@staticmethod
def _convert_graph_from_v_e(vertices, edges, feat_key=None, label_key=None, mask_keys=None):
if len(vertices) == 0:
Expand All @@ -175,6 +301,154 @@ def _convert_graph_from_v_e(vertices, edges, feat_key=None, label_key=None, mask
graph_dgl.ndata[mk] = mask
return graph_dgl

@staticmethod
def _convert_graph_from_v_e_nx(vertices, edges):
    """Build an undirected NetworkX graph from raw vertex and edge records.

    Vertex ids are replaced by their position in ``vertices``; every edge's
    ``"outV"``/``"inV"`` pair is translated through the same mapping.

    Args:
        vertices: Vertex records, each with an ``"id"`` entry.
        edges: Edge records, each with ``"outV"`` and ``"inV"`` entries.

    Returns:
        An ``nx.Graph``; an empty one (after a warning) when ``vertices``
        is empty.
    """
    if len(vertices) == 0:
        warnings.warn("This graph has no vertices", Warning)
        return nx.Graph(())

    index_of = {}
    for position, vertex in enumerate(vertices):
        index_of[vertex["id"]] = position

    node_indices = [index_of[vertex["id"]] for vertex in vertices]
    endpoints = [(record["outV"], record["inV"]) for record in edges]
    links = [(index_of[tail], index_of[head]) for tail, head in endpoints]

    graph_nx = nx.Graph()
    graph_nx.add_nodes_from(node_indices)
    graph_nx.add_edges_from(links)
    return graph_nx

@staticmethod
def _convert_graph_from_v_e_with_edge_feat(
    vertices,
    edges,
    edge_feat_key,
    node_feat_key=None,
    label_key=None,
    mask_keys=None,
):
    """Build a DGL graph with node and edge data from raw records.

    Vertex ids are replaced by their position in ``vertices``. A property
    is attached only when its key is given and present on the first
    vertex (or first edge, for the edge feature).

    Args:
        vertices: Vertex records with ``"id"`` and ``"properties"`` entries.
        edges: Edge records with ``"outV"``, ``"inV"`` and ``"properties"``.
        edge_feat_key: Edge property stored as ``edata["feat"]`` (int64).
        node_feat_key: Vertex property stored as ``ndata["feat"]`` (int64).
        label_key: Vertex property stored as ``ndata["label"]`` (long).
        mask_keys: Vertex properties stored as bool ``ndata`` under their
            own names.

    Returns:
        The DGL graph; an empty one (after a warning) when ``vertices`` is
        empty.
    """
    if len(vertices) == 0:
        warnings.warn("This graph has no vertices", Warning)
        return dgl.graph(())

    vertex_id_to_idx = {v["id"]: idx for idx, v in enumerate(vertices)}
    src_idx = [vertex_id_to_idx[e["outV"]] for e in edges]
    dst_idx = [vertex_id_to_idx[e["inV"]] for e in edges]
    # Pin the node count to the vertex list. Without it DGL infers the count
    # from the largest endpoint index, so vertices without edges at the end
    # of the list would be dropped and the per-vertex tensors below would
    # no longer match the graph's node count.
    graph_dgl = dgl.graph((src_idx, dst_idx), num_nodes=len(vertices))

    vertex_props = vertices[0]["properties"]
    if node_feat_key and node_feat_key in vertex_props:
        node_feats = [v["properties"][node_feat_key] for v in vertices]
        graph_dgl.ndata["feat"] = torch.tensor(node_feats, dtype=torch.int64)
    # `edges` may be empty; only inspect the first edge when there is one.
    if edges and edge_feat_key and edge_feat_key in edges[0]["properties"]:
        edge_feats = [e["properties"][edge_feat_key] for e in edges]
        graph_dgl.edata["feat"] = torch.tensor(edge_feats, dtype=torch.int64)
    if label_key and label_key in vertex_props:
        node_labels = [v["properties"][label_key] for v in vertices]
        graph_dgl.ndata["label"] = torch.tensor(node_labels, dtype=torch.long)
    for mk in mask_keys or ():
        if mk in vertex_props:
            node_masks = [v["properties"][mk] for v in vertices]
            graph_dgl.ndata[mk] = torch.tensor(node_masks, dtype=torch.bool)
    return graph_dgl

@staticmethod
def _convert_graph_from_ogb(vertices, edges, feat_key, year_key, weight_key):
    """Build a DGL graph for an OGB-style dataset from raw records.

    Args:
        vertices: Vertex records with ``"id"`` and ``"properties"`` entries.
        edges: Edge records with ``"outV"``, ``"inV"`` and ``"properties"``.
        feat_key: Vertex property stored as ``ndata["feat"]`` (float32).
        year_key: Edge property stored as ``edata["year"]`` (int64).
        weight_key: Edge property stored as ``edata["weight"]`` (int64).

    Returns:
        A ``(graph_dgl, vertex_id_to_idx)`` pair, where ``vertex_id_to_idx``
        maps each vertex id to its position in ``vertices``. When
        ``vertices`` is empty a warning is issued and an empty graph with an
        empty mapping is returned, so callers can always unpack two values.
    """
    if len(vertices) == 0:
        warnings.warn("This graph has no vertices", Warning)
        # Keep the same 2-tuple shape as the normal return path.
        return dgl.graph(()), {}

    vertex_id_to_idx = {v["id"]: idx for idx, v in enumerate(vertices)}
    src_idx = [vertex_id_to_idx[e["outV"]] for e in edges]
    dst_idx = [vertex_id_to_idx[e["inV"]] for e in edges]
    graph_dgl = dgl.graph((src_idx, dst_idx))

    if feat_key and feat_key in vertices[0]["properties"]:
        # Only as many feature rows as the graph has nodes are used, so the
        # tensor length matches the node count of the graph built above.
        node_feats = [
            v["properties"][feat_key]
            for v in vertices[0 : graph_dgl.number_of_nodes()]
        ]
        graph_dgl.ndata["feat"] = torch.tensor(node_feats, dtype=torch.float32)

    # `edges` may be empty; only inspect the first edge when there is one.
    edge_props = edges[0]["properties"] if edges else {}
    if year_key and year_key in edge_props:
        year = [e["properties"][year_key] for e in edges]
        graph_dgl.edata["year"] = torch.tensor(year, dtype=torch.int64)
    if weight_key and weight_key in edge_props:
        weight = [e["properties"][weight_key] for e in edges]
        graph_dgl.edata["weight"] = torch.tensor(weight, dtype=torch.int64)

    return graph_dgl, vertex_id_to_idx

@staticmethod
def _convert_split_edge_from_ogb(edges, vertex_id_to_idx):
train_edge_list = []
train_year_list = []
train_weight_list = []
valid_edge_list = []
valid_year_list = []
valid_weight_list = []
valid_edge_neg_list = []
test_edge_list = []
test_year_list = []
test_weight_list = []
test_edge_neg_list = []

for edge in edges:
if edge["properties"]["train_edge_mask"] == 1:
train_edge_list.append(
[vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]]
)
if edge["properties"]["train_year_mask"] != -1:
train_year_list.append(edge["properties"]["train_year_mask"])
if edge["properties"]["train_weight_mask"] != -1:
train_weight_list.append(edge["properties"]["train_weight_mask"])

if edge["properties"]["valid_edge_mask"] == 1:
valid_edge_list.append(
[vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]]
)
if edge["properties"]["valid_year_mask"] != -1:
valid_year_list.append(edge["properties"]["valid_year_mask"])
if edge["properties"]["valid_weight_mask"] != -1:
valid_weight_list.append(edge["properties"]["valid_weight_mask"])
if edge["properties"]["valid_edge_neg_mask"] == 1:
valid_edge_neg_list.append(
[vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]]
)

if edge["properties"]["test_edge_mask"] == 1:
test_edge_list.append(
[vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]]
)
if edge["properties"]["test_year_mask"] != -1:
test_year_list.append(edge["properties"]["test_year_mask"])
if edge["properties"]["test_weight_mask"] != -1:
test_weight_list.append(edge["properties"]["test_weight_mask"])
if edge["properties"]["test_edge_neg_mask"] == 1:
test_edge_neg_list.append(
[vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]]
)

split_edge = {
"train": {
"edge": torch.tensor(train_edge_list),
"weight": torch.tensor(train_weight_list),
"year": torch.tensor(train_year_list),
},
"valid": {
"edge": torch.tensor(valid_edge_list),
"weight": torch.tensor(valid_weight_list),
"year": torch.tensor(valid_year_list),
"edge_neg": torch.tensor(valid_edge_neg_list),
},
"test": {
"edge": torch.tensor(test_edge_list),
"weight": torch.tensor(test_weight_list),
"year": torch.tensor(test_year_list),
"edge_neg": torch.tensor(test_edge_neg_list),
},
}

return split_edge

if __name__ == "__main__":
hg2d = HugeGraph2DGL()
Expand All @@ -188,3 +462,22 @@ def _convert_graph_from_v_e(vertices, edges, feat_key=None, label_key=None, mask
vertex_labels=["ACM_paper_v", "ACM_author_v", "ACM_field_v"],
edge_labels=["ACM_ap_e", "ACM_fp_e", "ACM_pa_e", "ACM_pf_e"]
)
hg2d.convert_graph_nx(vertex_label="CAVEMAN_vertex", edge_label="CAVEMAN_edge")
hg2d.convert_graph_with_edge_feat(
vertex_label="CORA_edge_feat_vertex", edge_label="CORA_edge_feat_edge"
)
hg2d.convert_graph_ogb(
vertex_label="ogbl-collab_vertex",
edge_label="ogbl-collab_edge",
split_label="ogbl-collab_split_edge",
)
hg2d.convert_hetero_graph_bgnn(
vertex_labels=["AVAZU__N_v"], edge_labels=["AVAZU__E_e"]
)
hg2d.convert_hetero_graph(
vertex_labels=["AMAZONGATNE__N_v"],
edge_labels=[
"AMAZONGATNE_1_e",
"AMAZONGATNE_2_e",
],
)
Loading

0 comments on commit 0f609ba

Please sign in to comment.