From 0f609ba9ac60d772c32a20d3a6b12058a642edf0 Mon Sep 17 00:00:00 2001 From: SoJ <102796027+MrJs133@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:33:02 +0800 Subject: [PATCH] feat(ml): graph learning algorithm impl (10+) (#102) * glcc-hugegraph-graph-ai-B * change readme.md * Update README.md * Update test_examples.py --------- Co-authored-by: Simon Cheung --- hugegraph-ml/README.md | 56 +- .../src/hugegraph_ml/data/hugegraph2dgl.py | 295 ++++++- .../src/hugegraph_ml/examples/agnn_example.py | 39 + .../hugegraph_ml/examples/appnp_example.py | 43 + .../src/hugegraph_ml/examples/arma_example.py | 42 + .../src/hugegraph_ml/examples/bgnn_example.py | 67 ++ .../src/hugegraph_ml/examples/bgrl_example.py | 50 ++ .../hugegraph_ml/examples/care_gnn_example.py | 51 ++ .../examples/cluster_gcn_example.py | 37 + .../examples/correct_and_smooth_example.py | 51 ++ .../hugegraph_ml/examples/dagnn_example.py | 39 + .../examples/deepergcn_example.py | 42 + .../hugegraph_ml/examples/gatne_example.py | 46 + .../src/hugegraph_ml/examples/pgnn_example.py | 35 + .../src/hugegraph_ml/examples/seal_example.py | 50 ++ hugegraph-ml/src/hugegraph_ml/models/agnn.py | 64 ++ hugegraph-ml/src/hugegraph_ml/models/appnp.py | 84 ++ hugegraph-ml/src/hugegraph_ml/models/arma.py | 175 ++++ hugegraph-ml/src/hugegraph_ml/models/bgnn.py | 741 ++++++++++++++++ hugegraph-ml/src/hugegraph_ml/models/bgrl.py | 260 ++++++ .../src/hugegraph_ml/models/care_gnn.py | 232 +++++ .../src/hugegraph_ml/models/cluster_gcn.py | 58 ++ .../hugegraph_ml/models/correct_and_smooth.py | 262 ++++++ hugegraph-ml/src/hugegraph_ml/models/dagnn.py | 145 +++ .../src/hugegraph_ml/models/deepergcn.py | 287 ++++++ hugegraph-ml/src/hugegraph_ml/models/gatne.py | 273 ++++++ hugegraph-ml/src/hugegraph_ml/models/pgnn.py | 462 ++++++++++ hugegraph-ml/src/hugegraph_ml/models/seal.py | 826 ++++++++++++++++++ .../tasks/fraud_detector_caregnn.py | 122 +++ .../tasks/hetero_sample_embed_gatne.py | 122 +++ .../tasks/link_prediction_pgnn.py | 94 ++ .../tasks/link_prediction_seal.py | 172 ++++ .../tasks/node_classify_with_edge.py | 123 +++ .../tasks/node_classify_with_sample.py | 156 ++++ .../hugegraph_ml/utils/dgl2hugegraph_utils.py | 745 +++++++++++++++- .../src/tests/test_examples/test_examples.py | 85 +- 36 files changed, 6413 insertions(+), 18 deletions(-) create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/agnn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/appnp_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/arma_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/bgnn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/bgrl_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/care_gnn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/cluster_gcn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/correct_and_smooth_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/dagnn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/deepergcn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/gatne_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/pgnn_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/examples/seal_example.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/agnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/appnp.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/arma.py create mode 100644 
hugegraph-ml/src/hugegraph_ml/models/bgnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/bgrl.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/care_gnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/cluster_gcn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/correct_and_smooth.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/dagnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/deepergcn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/gatne.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/pgnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/models/seal.py create mode 100644 hugegraph-ml/src/hugegraph_ml/tasks/fraud_detector_caregnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/tasks/hetero_sample_embed_gatne.py create mode 100644 hugegraph-ml/src/hugegraph_ml/tasks/link_prediction_pgnn.py create mode 100644 hugegraph-ml/src/hugegraph_ml/tasks/link_prediction_seal.py create mode 100644 hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_edge.py create mode 100644 hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_sample.py diff --git a/hugegraph-ml/README.md b/hugegraph-ml/README.md index 26e16ddd..c6fca7f9 100644 --- a/hugegraph-ml/README.md +++ b/hugegraph-ml/README.md @@ -1,4 +1,4 @@ - # hugegraph-ml + # hugegraph-ml ## Summary @@ -7,6 +7,26 @@ It implements most graph learning algorithms, enabling users to perform end-to-e Graph data can be read directly from `HugeGraph` and used for tasks such as node embedding, node classification, and graph classification. The implemented algorithm models can be found in the [models](./src/hugegraph_ml/models) folder. +| model | paper | +| ----------- | -------------------------------------------------- | +| AGNN | https://arxiv.org/abs/1803.03735 | +| APPNP | https://arxiv.org/abs/1810.05997 | +| ARMA | https://arxiv.org/abs/1901.01343 | +| BGNN | https://arxiv.org/abs/2101.08543 | +| BGRL | https://arxiv.org/abs/2102.06514 | +| CARE-GNN | https://arxiv.org/abs/2008.08692 | +| Cluster-GCN | https://arxiv.org/abs/1905.07953 | +| C&S | https://arxiv.org/abs/2010.13993 | +| DAGNN | https://arxiv.org/abs/2007.09296 | +| DeeperGCN | https://arxiv.org/abs/2006.07739 | +| DGI | https://arxiv.org/abs/1809.10341 | +| DiffPool | https://arxiv.org/abs/1806.08804 | +| GATNE | https://arxiv.org/abs/1905.01669 | +| GRACE | https://arxiv.org/abs/2006.04131 | +| GRAND | https://arxiv.org/abs/2005.11079 | +| JKNet | https://arxiv.org/abs/1806.03536 | +| P-GNN | http://proceedings.mlr.press/v97/you19b/you19b.pdf | +| SEAL | https://arxiv.org/abs/1802.09691 | ## Environment Requirements @@ -16,22 +36,28 @@ The implemented algorithm models can be found in the [models](./src/hugegraph_ml ## Preparation 1. Start the HugeGraph database, you can do it via Docker/[Binary packages](https://hugegraph.apache.org/docs/download/download/). -Refer to [docker-link](https://hub.docker.com/r/hugegraph/hugegraph) & [deploy-doc](https://hugegraph.apache.org/docs/quickstart/hugegraph-server/#31-use-docker-container-convenient-for-testdev) for guidance + Refer to [docker-link](https://hub.docker.com/r/hugegraph/hugegraph) & [deploy-doc](https://hugegraph.apache.org/docs/quickstart/hugegraph-server/#31-use-docker-container-convenient-for-testdev) for guidance + 2. Clone this project - ```bash - git clone https://github.com/apache/incubator-hugegraph-ai.git - ``` -3. 
Install [hugegraph-python-client](../hugegraph-python-client) and [hugegraph_ml](../hugegraph-ml) - ```bash - cd ./incubator-hugegraph-ai # better to use virtualenv (source venv/bin/activate) - pip install ./hugegraph-python-client - cd ./hugegraph-ml/ - pip install -e . - ``` + + ```bash + git clone https://github.com/apache/incubator-hugegraph-ai.git + ``` + +3. Install [hugegraph-python-client](../hugegraph-python-client) and [hugegraph-ml](../hugegraph-ml) + + ```bash + cd ./incubator-hugegraph-ai # better to use virtualenv (source venv/bin/activate) + pip install ./hugegraph-python-client + cd ./hugegraph-ml/ + pip install -e . + ``` + 4. Enter the project directory - ```bash - cd ./hugegraph-ml/src - ``` + + ```bash + cd ./hugegraph-ml/src + ``` ## Examples diff --git a/hugegraph-ml/src/hugegraph_ml/data/hugegraph2dgl.py b/hugegraph-ml/src/hugegraph_ml/data/hugegraph2dgl.py index 658b404f..92ea00cc 100644 --- a/hugegraph-ml/src/hugegraph_ml/data/hugegraph2dgl.py +++ b/hugegraph-ml/src/hugegraph_ml/data/hugegraph2dgl.py @@ -26,7 +26,7 @@ from pyhugegraph.client import PyHugeClient from hugegraph_ml.data.hugegraph_dataset import HugeGraphDataset - +import networkx as nx class HugeGraph2DGL: def __init__( @@ -150,6 +150,132 @@ def convert_graph_dataset( dataset_dgl = HugeGraphDataset(graphs=graphs, labels=graph_labels, info=graphs_info) return dataset_dgl + def convert_graph_nx( + self, + vertex_label: str, + edge_label: str, + ): + vertices = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")["data"] + edges = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"] + graph_nx = self._convert_graph_from_v_e_nx(vertices=vertices, edges=edges) + return graph_nx + + def convert_graph_with_edge_feat( + self, + vertex_label: str, + edge_label: str, + node_feat_key: str = "feat", + edge_feat_key: str = "edge_feat", + label_key: str = "label", + mask_keys: Optional[List[str]] = None, + ): + if mask_keys is None: + mask_keys = ["train_mask", "val_mask", "test_mask"] + vertices = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")["data"] + edges = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"] + graph_dgl = self._convert_graph_from_v_e_with_edge_feat( + vertices, edges, edge_feat_key, node_feat_key, label_key, mask_keys + ) + + return graph_dgl + + def convert_graph_ogb(self, vertex_label: str, edge_label: str, split_label: str): + vertices = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")["data"] + edges = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"] + graph_dgl, vertex_id_to_idx = self._convert_graph_from_ogb( + vertices, edges, "feat", "year", "weight" + ) + edges_split = self._graph_germlin.exec(f"g.E().hasLabel('{split_label}')")[ + "data" + ] + split_edge = self._convert_split_edge_from_ogb(edges_split, vertex_id_to_idx) + return graph_dgl, split_edge + + def convert_hetero_graph_bgnn( + self, + vertex_labels: List[str], + edge_labels: List[str], + feat_key: str = "feat", + label_key: str = "class", + cat_key: str = "cat_features", + mask_keys: Optional[List[str]] = None, + ): + if mask_keys is None: + mask_keys = ["train_mask", "val_mask", "test_mask"] + vertex_label_id2idx = {} + vertex_label_data = {} + # for each vertex label + for vertex_label in vertex_labels: + vertices = self._graph_germlin.exec(f"g.V().hasLabel('{vertex_label}')")[ + "data" + ] + if len(vertices) == 0: + warnings.warn( + f"Graph has no vertices of vertex_label: {vertex_label}", Warning + ) + else: + vertex_ids = [v["id"] 
for v in vertices] + id2idx = {vertex_id: idx for idx, vertex_id in enumerate(vertex_ids)} + vertex_label_id2idx[vertex_label] = id2idx + # extract vertex property(feat, label, mask) + vertex_label_data[vertex_label] = {} + if feat_key in vertices[0]["properties"]: + node_feats = torch.tensor( + [v["properties"][feat_key] for v in vertices], + dtype=torch.int32, + ) + vertex_label_data[vertex_label]["feat"] = node_feats + if label_key in vertices[0]["properties"]: + node_labels = torch.tensor( + [v["properties"][label_key] for v in vertices], + dtype=torch.float64, + ) + vertex_label_data[vertex_label]["class"] = node_labels + if cat_key in vertices[0]["properties"]: + node_cat = torch.tensor( + [v["properties"][cat_key] for v in vertices], + dtype=torch.int32, + ) + vertex_label_data[vertex_label]["cat_features"] = node_cat + if mask_keys: + for mk in mask_keys: + if mk in vertices[0]["properties"]: + mask = torch.tensor( + [v["properties"][mk] for v in vertices], + dtype=torch.bool, + ) + vertex_label_data[vertex_label][mk] = mask + # build hetero graph from edges + edge_data_dict = {} + for edge_label in edge_labels: + edges = self._graph_germlin.exec(f"g.E().hasLabel('{edge_label}')")["data"] + if len(edges) == 0: + warnings.warn( + f"Graph has no edges of edge_label: {edge_label}", Warning + ) + else: + src_vertex_label = edges[0]["outVLabel"] + src_idx = [ + vertex_label_id2idx[src_vertex_label][e["outV"]] for e in edges + ] + dst_vertex_label = edges[0]["inVLabel"] + dst_idx = [ + vertex_label_id2idx[dst_vertex_label][e["inV"]] for e in edges + ] + edge_data_dict[(src_vertex_label, edge_label, dst_vertex_label)] = ( + src_idx, + dst_idx, + ) + # add vertex properties data + hetero_graph = dgl.heterograph(edge_data_dict) + for vertex_label in vertex_labels: + for prop in vertex_label_data[vertex_label]: + hetero_graph.nodes[vertex_label].data[prop] = vertex_label_data[ + vertex_label + ][prop] + + return hetero_graph + @staticmethod def _convert_graph_from_v_e(vertices, edges, feat_key=None, label_key=None, mask_keys=None): if len(vertices) == 0: @@ -175,6 +301,154 @@ def _convert_graph_from_v_e(vertices, edges, feat_key=None, label_key=None, mask graph_dgl.ndata[mk] = mask return graph_dgl + @staticmethod + def _convert_graph_from_v_e_nx(vertices, edges): + if len(vertices) == 0: + warnings.warn("This graph has no vertices", Warning) + return nx.Graph(()) + vertex_ids = [v["id"] for v in vertices] + vertex_id_to_idx = {vertex_id: idx for idx, vertex_id in enumerate(vertex_ids)} + new_vertex_ids = [vertex_id_to_idx[id] for id in vertex_ids] + edge_list = [(edge["outV"], edge["inV"]) for edge in edges] + new_edge_list = [ + (vertex_id_to_idx[src], vertex_id_to_idx[dst]) for src, dst in edge_list + ] + graph_nx = nx.Graph() + graph_nx.add_nodes_from(new_vertex_ids) + graph_nx.add_edges_from(new_edge_list) + return graph_nx + + @staticmethod + def _convert_graph_from_v_e_with_edge_feat( + vertices, + edges, + edge_feat_key, + node_feat_key=None, + label_key=None, + mask_keys=None, + ): + if len(vertices) == 0: + warnings.warn("This graph has no vertices", Warning) + return dgl.graph(()) + vertex_ids = [v["id"] for v in vertices] + vertex_id_to_idx = {vertex_id: idx for idx, vertex_id in enumerate(vertex_ids)} + src_idx = [vertex_id_to_idx[e["outV"]] for e in edges] + dst_idx = [vertex_id_to_idx[e["inV"]] for e in edges] + graph_dgl = dgl.graph((src_idx, dst_idx)) + + if node_feat_key and node_feat_key in vertices[0]["properties"]: + node_feats = [v["properties"][node_feat_key] for v in 
vertices] + graph_dgl.ndata["feat"] = torch.tensor(node_feats, dtype=torch.int64) + if edge_feat_key and edge_feat_key in edges[0]["properties"]: + edge_feats = [e["properties"][edge_feat_key] for e in edges] + graph_dgl.edata["feat"] = torch.tensor(edge_feats, dtype=torch.int64) + if label_key and label_key in vertices[0]["properties"]: + node_labels = [v["properties"][label_key] for v in vertices] + graph_dgl.ndata["label"] = torch.tensor(node_labels, dtype=torch.long) + if mask_keys: + for mk in mask_keys: + if mk in vertices[0]["properties"]: + node_masks = [v["properties"][mk] for v in vertices] + mask = torch.tensor(node_masks, dtype=torch.bool) + graph_dgl.ndata[mk] = mask + return graph_dgl + + @staticmethod + def _convert_graph_from_ogb(vertices, edges, feat_key, year_key, weight_key): + if len(vertices) == 0: + warnings.warn("This graph has no vertices", Warning) + return dgl.graph(()) + vertex_ids = [v["id"] for v in vertices] + vertex_id_to_idx = {vertex_id: idx for idx, vertex_id in enumerate(vertex_ids)} + src_idx = [vertex_id_to_idx[e["outV"]] for e in edges] + dst_idx = [vertex_id_to_idx[e["inV"]] for e in edges] + graph_dgl = dgl.graph((src_idx, dst_idx)) + if feat_key and feat_key in vertices[0]["properties"]: + node_feats = [ + v["properties"][feat_key] + for v in vertices[0 : graph_dgl.number_of_nodes()] + ] + graph_dgl.ndata["feat"] = torch.tensor(node_feats, dtype=torch.float32) + if year_key and year_key in edges[0]["properties"]: + year = [e["properties"][year_key] for e in edges] + graph_dgl.edata["year"] = torch.tensor(year, dtype=torch.int64) + if weight_key and weight_key in edges[0]["properties"]: + weight = [e["properties"][weight_key] for e in edges] + graph_dgl.edata["weight"] = torch.tensor(weight, dtype=torch.int64) + + return graph_dgl, vertex_id_to_idx + + @staticmethod + def _convert_split_edge_from_ogb(edges, vertex_id_to_idx): + train_edge_list = [] + train_year_list = [] + train_weight_list = [] + valid_edge_list = [] + valid_year_list = [] + valid_weight_list = [] + valid_edge_neg_list = [] + test_edge_list = [] + test_year_list = [] + test_weight_list = [] + test_edge_neg_list = [] + + for edge in edges: + if edge["properties"]["train_edge_mask"] == 1: + train_edge_list.append( + [vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]] + ) + if edge["properties"]["train_year_mask"] != -1: + train_year_list.append(edge["properties"]["train_year_mask"]) + if edge["properties"]["train_weight_mask"] != -1: + train_weight_list.append(edge["properties"]["train_weight_mask"]) + + if edge["properties"]["valid_edge_mask"] == 1: + valid_edge_list.append( + [vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]] + ) + if edge["properties"]["valid_year_mask"] != -1: + valid_year_list.append(edge["properties"]["valid_year_mask"]) + if edge["properties"]["valid_weight_mask"] != -1: + valid_weight_list.append(edge["properties"]["valid_weight_mask"]) + if edge["properties"]["valid_edge_neg_mask"] == 1: + valid_edge_neg_list.append( + [vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]] + ) + + if edge["properties"]["test_edge_mask"] == 1: + test_edge_list.append( + [vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]] + ) + if edge["properties"]["test_year_mask"] != -1: + test_year_list.append(edge["properties"]["test_year_mask"]) + if edge["properties"]["test_weight_mask"] != -1: + test_weight_list.append(edge["properties"]["test_weight_mask"]) + if edge["properties"]["test_edge_neg_mask"] == 1: + test_edge_neg_list.append( + 
[vertex_id_to_idx[edge["outV"]], vertex_id_to_idx[edge["inV"]]] + ) + + split_edge = { + "train": { + "edge": torch.tensor(train_edge_list), + "weight": torch.tensor(train_weight_list), + "year": torch.tensor(train_year_list), + }, + "valid": { + "edge": torch.tensor(valid_edge_list), + "weight": torch.tensor(valid_weight_list), + "year": torch.tensor(valid_year_list), + "edge_neg": torch.tensor(valid_edge_neg_list), + }, + "test": { + "edge": torch.tensor(test_edge_list), + "weight": torch.tensor(test_weight_list), + "year": torch.tensor(test_year_list), + "edge_neg": torch.tensor(test_edge_neg_list), + }, + } + + return split_edge if __name__ == "__main__": hg2d = HugeGraph2DGL() @@ -188,3 +462,22 @@ def _convert_graph_from_v_e(vertices, edges, feat_key=None, label_key=None, mask vertex_labels=["ACM_paper_v", "ACM_author_v", "ACM_field_v"], edge_labels=["ACM_ap_e", "ACM_fp_e", "ACM_pa_e", "ACM_pf_e"] ) + hg2d.convert_graph_nx(vertex_label="CAVEMAN_vertex", edge_label="CAVEMAN_edge") + hg2d.convert_graph_with_edge_feat( + vertex_label="CORA_edge_feat_vertex", edge_label="CORA_edge_feat_edge" + ) + hg2d.convert_graph_ogb( + vertex_label="ogbl-collab_vertex", + edge_label="ogbl-collab_edge", + split_label="ogbl-collab_split_edge", + ) + hg2d.convert_hetero_graph_bgnn( + vertex_labels=["AVAZU__N_v"], edge_labels=["AVAZU__E_e"] + ) + hg2d.convert_hetero_graph( + vertex_labels=["AMAZONGATNE__N_v"], + edge_labels=[ + "AMAZONGATNE_1_e", + "AMAZONGATNE_2_e", + ], + ) \ No newline at end of file diff --git a/hugegraph-ml/src/hugegraph_ml/examples/agnn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/agnn_example.py new file mode 100644 index 00000000..5b5b14ba --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/agnn_example.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
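Before the AGNN example below, a note on the OGB converter above: `convert_graph_ogb` returns the DGL graph plus the nested `split_edge` dict built by `_convert_split_edge_from_ogb`, which the SEAL example later consumes. A minimal sketch of the expected layout (key names come from the code above; node counts and tensor sizes here are made-up toy values):

```python
import torch

# Toy illustration of the split_edge dict produced above.
# Tensor sizes are arbitrary; only the key structure matters.
split_edge = {
    "train": {
        "edge": torch.randint(0, 100, (60, 2)),      # [n_train, 2] endpoint indices
        "weight": torch.ones(60, dtype=torch.long),
        "year": torch.full((60,), 2019),
    },
    "valid": {
        "edge": torch.randint(0, 100, (20, 2)),
        "weight": torch.ones(20, dtype=torch.long),
        "year": torch.full((20,), 2020),
        "edge_neg": torch.randint(0, 100, (20, 2)),  # sampled negatives
    },
    "test": {
        "edge": torch.randint(0, 100, (20, 2)),
        "weight": torch.ones(20, dtype=torch.long),
        "year": torch.full((20,), 2021),
        "edge_neg": torch.randint(0, 100, (20, 2)),
    },
}
print({split: {k: tuple(v.shape) for k, v in d.items()} for split, d in split_edge.items()})
```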
+ +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.agnn import AGNN +from hugegraph_ml.tasks.node_classify import NodeClassify + + +def agnn_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + model = AGNN( + num_layers=2, + in_dim=graph.ndata["feat"].shape[1], + hid_dim=64, + out_dim=graph.ndata["label"].unique().shape[0], + dropout=0.2, + ) + node_clf_task = NodeClassify(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + agnn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/appnp_example.py b/hugegraph-ml/src/hugegraph_ml/examples/appnp_example.py new file mode 100644 index 00000000..6754b747 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/appnp_example.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.appnp import APPNP +from hugegraph_ml.tasks.node_classify import NodeClassify +import torch.nn.functional as F + + +def appnp_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + model = APPNP( + in_feats=graph.ndata["feat"].shape[1], + hiddens=[64], + n_classes=graph.ndata["label"].unique().shape[0], + activation=F.relu, + feat_drop=0.5, + edge_drop=0.5, + alpha=0.1, + k=10, + ) + node_clf_task = NodeClassify(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + appnp_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/arma_example.py b/hugegraph-ml/src/hugegraph_ml/examples/arma_example.py new file mode 100644 index 00000000..0c75b5be --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/arma_example.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.arma import ARMA4NC +from hugegraph_ml.tasks.node_classify import NodeClassify +from torch import nn + + +def arma_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + model = ARMA4NC( + in_dim=graph.ndata["feat"].shape[1], + hid_dim=16, + out_dim=graph.ndata["label"].unique().shape[0], + num_stacks=2, + num_layers=1, + activation=nn.ReLU(), + dropout=0.75, + ) + node_clf_task = NodeClassify(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + arma_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/bgnn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/bgnn_example.py new file mode 100644 index 00000000..7c353f3b --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/bgnn_example.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.models.bgnn import ( + GNNModelDGL, + BGNNPredictor, + encode_cat_features, + replace_na, + convert_data, +) +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL + + +def bgnn_example(): + hg2d = HugeGraph2DGL() + g = hg2d.convert_hetero_graph_bgnn( + vertex_labels=["AVAZU__N_v"], edge_labels=["AVAZU__E_e"] + ) + X, y, cat_features, train_mask, val_mask, test_mask = convert_data(g) + encoded_X = X.copy() + encoded_X = encode_cat_features( + encoded_X, y, cat_features, train_mask, val_mask, test_mask + ) + encoded_X = replace_na(encoded_X, train_mask) + gnn_model = GNNModelDGL(in_dim=y.shape[1], hidden_dim=128, out_dim=y.shape[1]) + bgnn = BGNNPredictor( + gnn_model, + task="regression", + loss_fn=None, + trees_per_epoch=5, + backprop_per_epoch=5, + lr=0.1, + append_gbdt_pred=False, + gbdt_depth=6, + gbdt_lr=0.1, + ) + metrics = bgnn.fit( + g, + encoded_X, + y, + train_mask, + val_mask, + test_mask, + original_X=X, + cat_features=cat_features, + num_epochs=100, + patience=10, + metric_name="loss", + ) + + +if __name__ == "__main__": + bgnn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/bgrl_example.py b/hugegraph-ml/src/hugegraph_ml/examples/bgrl_example.py new file mode 100644 index 00000000..77c03f96 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/bgrl_example.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.bgrl import BGRL, GCN, MLP_Predictor, CosineDecayScheduler, get_graph_drop_transform +from hugegraph_ml.models.mlp import MLPClassifier +from hugegraph_ml.tasks.node_classify import NodeClassify +from hugegraph_ml.tasks.node_embed import NodeEmbed + + +def bgrl_example(n_epochs_embed=300, n_epochs_clf=400): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + encoder = GCN([graph.ndata["feat"].size(1)] + [256, 128]) + predictor = MLP_Predictor( + input_size=128, + output_size=128, + hidden_size=512, + ) + model = BGRL(encoder=encoder, predictor=predictor) + node_embed_task = NodeEmbed(graph=graph, model=model) + embedded_graph = node_embed_task.train_and_embed( + add_self_loop=True, lr=0.001, weight_decay=1e-5, n_epochs=n_epochs_embed, patience=40 + ) + model = MLPClassifier( + n_in_feat=embedded_graph.ndata["feat"].shape[1], + n_out_feat=embedded_graph.ndata["label"].unique().shape[0], + n_hidden=128 + ) + node_clf_task = NodeClassify(graph=embedded_graph, model=model) + node_clf_task.train(lr=1e-3, n_epochs=n_epochs_clf, patience=30) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + bgrl_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/care_gnn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/care_gnn_example.py new file mode 100644 index 00000000..e6fb52f2 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/care_gnn_example.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
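On the BGRL example above: the `NodeEmbed` task hides the self-supervised loop, but BGRL's core update is an online encoder and predictor chasing an exponential-moving-average target encoder across two augmented views, with no negative samples. A hedged sketch of one such step (module names and signatures here are illustrative, not the repo's `BGRL` API):

```python
import torch
import torch.nn.functional as F

def bgrl_step(online, predictor, target, g1, x1, g2, x2, opt, momentum=0.99):
    """One illustrative BGRL update; online/target are GNN encoders, predictor an MLP."""
    q1 = predictor(online(g1, x1))  # online branch, view 1
    q2 = predictor(online(g2, x2))  # online branch, view 2
    with torch.no_grad():           # target branch receives no gradients
        y1 = target(g1, x1)
        y2 = target(g2, x2)
    # symmetric negative cosine similarity, crosswise between the two views
    loss = (2 - F.cosine_similarity(q1, y2, dim=-1).mean()
              - F.cosine_similarity(q2, y1, dim=-1).mean())
    opt.zero_grad()
    loss.backward()
    opt.step()
    with torch.no_grad():           # EMA update keeps the target slow-moving
        for p_t, p_o in zip(target.parameters(), online.parameters()):
            p_t.mul_(momentum).add_(p_o, alpha=1 - momentum)
    return loss.item()
```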
+ +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.care_gnn import CAREGNN +from hugegraph_ml.tasks.fraud_detector_caregnn import DetectorCaregnn + +import dgl +import torch + + +def care_gnn_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_hetero_graph( + vertex_labels=["AMAZON_user_v"], + edge_labels=[ + "AMAZON_net_upu_e", + "AMAZON_net_usu_e", + "AMAZON_net_uvu_e", + ], + ) + model = CAREGNN( + in_dim=graph.ndata["feature"].shape[-1], + num_classes=graph.ndata["label"].unique().shape[0], + hid_dim=64, + num_layers=1, + activation=torch.tanh, + step_size=0.02, + edges=graph.canonical_etypes, + ) + detector_task = DetectorCaregnn(graph, model) + detector_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs) + print(detector_task.evaluate()) + + +if __name__ == "__main__": + care_gnn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/cluster_gcn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/cluster_gcn_example.py new file mode 100644 index 00000000..3cdcf8e3 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/cluster_gcn_example.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.cluster_gcn import SAGE +from hugegraph_ml.tasks.node_classify_with_sample import NodeClassifyWithSample + + +def cluster_gcn_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + model = SAGE( + in_feats=graph.ndata["feat"].shape[1], + n_hidden=64, + n_classes=graph.ndata["label"].unique().shape[0], + ) + node_clf_task = NodeClassifyWithSample(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + cluster_gcn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/correct_and_smooth_example.py b/hugegraph-ml/src/hugegraph_ml/examples/correct_and_smooth_example.py new file mode 100644 index 00000000..6faae7f1 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/correct_and_smooth_example.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.correct_and_smooth import MLP, MLPLinear +from hugegraph_ml.tasks.node_classify import NodeClassify +import argparse + + +def cs_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + if args.model == "mlp": + model = MLP( + in_dim=graph.ndata["feat"].shape[1], + hid_dim=64, + out_dim=graph.ndata["label"].unique().shape[0], + num_layers=3, + dropout=0.4, + ) + elif args.model == "linear": + model = MLPLinear( + in_dim=graph.ndata["feat"].shape[1], + out_dim=graph.ndata["label"].unique().shape[0], + ) + else: + raise NotImplementedError(f"Model {args.model} is not supported.") + node_clf_task = NodeClassify(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Base predictor(C&S)") + parser.add_argument("--model", type=str, default="mlp", choices=["mlp", "linear"]) + args = parser.parse_args() + cs_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/dagnn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/dagnn_example.py new file mode 100644 index 00000000..38f3e96d --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/dagnn_example.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
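Worth noting on the C&S example above: it trains only the base predictor (`MLP`/`MLPLinear`); the "correct" and "smooth" stages are post-processing passes that propagate training residuals, then predictions, over the normalized adjacency. A sketch of the shared propagation kernel (alpha and step count are illustrative; the repo's `correct_and_smooth.py` is the authoritative version):

```python
import dgl
import dgl.function as fn
import torch

def propagate(graph, feats, alpha=0.8, num_steps=50):
    """Iterate feats <- alpha * D^-1/2 A D^-1/2 feats + (1 - alpha) * feats_0."""
    with graph.local_scope():
        deg = graph.in_degrees().float().clamp(min=1)
        norm = deg.pow(-0.5).unsqueeze(1)
        init = feats
        for _ in range(num_steps):
            graph.ndata["h"] = feats * norm
            graph.update_all(fn.copy_u("h", "m"), fn.sum("m", "h"))
            feats = alpha * graph.ndata.pop("h") * norm + (1 - alpha) * init
        return feats

# "Correct" propagates train-set residuals and adds them to the soft predictions;
# "smooth" then propagates the corrected predictions with train labels clamped.
g = dgl.add_self_loop(dgl.rand_graph(8, 20))
print(propagate(g, torch.randn(8, 3)).shape)
```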
+ +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.dagnn import DAGNN +from hugegraph_ml.tasks.node_classify import NodeClassify + + +def dagnn_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph(vertex_label="CORA_vertex", edge_label="CORA_edge") + model = DAGNN( + k=12, + in_dim=graph.ndata["feat"].shape[1], + hid_dim=64, + out_dim=graph.ndata["label"].unique().shape[0], + dropout=0.8, + ) + node_clf_task = NodeClassify(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + dagnn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/deepergcn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/deepergcn_example.py new file mode 100644 index 00000000..197826e2 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/deepergcn_example.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.deepergcn import DeeperGCN +from hugegraph_ml.tasks.node_classify_with_edge import NodeClassifyWithEdge + + +def deepergcn_example(n_epochs=1000): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph_with_edge_feat( + vertex_label="CORA_vertex", edge_label="CORA_edge" + ) + model = DeeperGCN( + node_feat_dim=graph.ndata["feat"].shape[1], + edge_feat_dim=graph.edata["feat"].shape[1], + hid_dim=256, + out_dim=graph.ndata["label"].unique().shape[0], + num_layers=7, + dropout=0.2, + ) + node_clf_task = NodeClassifyWithEdge(graph, model) + node_clf_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs, patience=200) + print(node_clf_task.evaluate()) + + +if __name__ == "__main__": + deepergcn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/gatne_example.py b/hugegraph-ml/src/hugegraph_ml/examples/gatne_example.py new file mode 100644 index 00000000..0c9c0c5c --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/gatne_example.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.gatne import DGLGATNE, NeighborSampler +from hugegraph_ml.tasks.hetero_sample_embed_gatne import HeteroSampleEmbedGATNE + + +def gatne_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_hetero_graph( + vertex_labels=["AMAZONGATNE__N_v"], + edge_labels=[ + "AMAZONGATNE_1_e", + "AMAZONGATNE_2_e", + ], + ) + model = DGLGATNE( + graph.number_of_nodes(), + 200, + 10, + graph.etypes, + len(graph.etypes), + 20, + ) + gatne_task = HeteroSampleEmbedGATNE(graph, model) + embs = gatne_task.train_and_embed(lr=0.005, n_epochs=n_epochs) + + +if __name__ == "__main__": + gatne_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/pgnn_example.py b/hugegraph-ml/src/hugegraph_ml/examples/pgnn_example.py new file mode 100644 index 00000000..7297de23 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/pgnn_example.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.pgnn import PGNN, get_dataset +from hugegraph_ml.tasks.link_prediction_pgnn import LinkPredictionPGNN + + +def pgnn_example(n_epochs=200): + hg2d = HugeGraph2DGL() + graph = hg2d.convert_graph_nx( + vertex_label="CAVEMAN_vertex", edge_label="CAVEMAN_edge" + ) + model = PGNN(input_dim=get_dataset(graph)["feature"].shape[1]) + link_pre_task = LinkPredictionPGNN(graph, model) + link_pre_task.train(lr=0.005, weight_decay=0.0005, n_epochs=n_epochs) + + +if __name__ == "__main__": + pgnn_example() diff --git a/hugegraph-ml/src/hugegraph_ml/examples/seal_example.py b/hugegraph-ml/src/hugegraph_ml/examples/seal_example.py new file mode 100644 index 00000000..3d6e7d3b --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/examples/seal_example.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
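On the P-GNN example above: the model's position signal comes from distances to randomly sampled anchor sets, which `get_dataset` precomputes from the NetworkX graph. A rough sketch of that idea (the set sizes and the 1/(d+1) mapping follow the paper; everything else here is illustrative, not the repo's implementation):

```python
import random
import networkx as nx

def anchor_distance_features(g: nx.Graph, num_anchor_sets: int = 8):
    """Map each node to 1/(d+1) distances toward random anchor sets (0 if unreachable)."""
    nodes = list(g.nodes())
    columns = []
    for _ in range(num_anchor_sets):
        anchors = random.sample(nodes, max(1, len(nodes) // 4))
        dist = nx.multi_source_dijkstra_path_length(g, anchors)
        columns.append([1.0 / (dist[v] + 1) if v in dist else 0.0 for v in nodes])
    return [list(row) for row in zip(*columns)]  # [num_nodes][num_anchor_sets]

g = nx.connected_caveman_graph(4, 5)  # same family as the CAVEMAN dataset above
print(anchor_distance_features(g)[0])
```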
+ +from hugegraph_ml.data.hugegraph2dgl import HugeGraph2DGL +from hugegraph_ml.models.seal import DGCNN, data_prepare +from hugegraph_ml.tasks.link_prediction_seal import LinkPredictionSeal +import torch + + +def seal_example(n_epochs=200): + torch.manual_seed(2021) + hg2d = HugeGraph2DGL() + graph, split_edge = hg2d.convert_graph_ogb( + vertex_label="ogbl-collab_vertex", + edge_label="ogbl-collab_edge", + split_label="ogbl-collab_split_edge", + ) + node_attribute, edge_weight = data_prepare(graph=graph, split_edge=split_edge) + model = DGCNN( + num_layers=3, + hidden_units=32, + k=30, + gcn_type="gcn", + node_attributes=node_attribute, + edge_weights=edge_weight, + node_embedding=None, + use_embedding=True, + num_nodes=graph.num_nodes(), + dropout=0.5, + ) + link_pre_task = LinkPredictionSeal(graph, split_edge, model) + link_pre_task.train(lr=0.005, n_epochs=n_epochs) + + +if __name__ == "__main__": + seal_example() diff --git a/hugegraph-ml/src/hugegraph_ml/models/agnn.py b/hugegraph-ml/src/hugegraph_ml/models/agnn.py new file mode 100644 index 00000000..b3c3767d --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/agnn.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Attention-based Graph Neural Network (AGNN) + +References +---------- +Paper: https://arxiv.org/abs/1803.03735 +Author's code: +DGL code: https://github.com/dmlc/dgl/blob/master/python/dgl/nn/pytorch/conv/agnnconv.py +""" + +import dgl +import torch +from dgl.nn.pytorch.conv import AGNNConv +import torch.nn as nn +import torch.nn.functional as F + + +class AGNN(nn.Module): + def __init__(self, num_layers, in_dim, hid_dim, out_dim, dropout): + super().__init__() + self.num_layers = num_layers + self.embedding_layer = nn.Linear(in_dim, hid_dim, bias=False) + + self.attention_layers = nn.ModuleList() + # stack num_layers AGNN conv layers + for i in range(self.num_layers): + self.attention_layers.append(AGNNConv()) + + self.output_layer = nn.Linear(hid_dim, out_dim, bias=False) + + self.dropout = nn.Dropout(dropout) + + self.criterion = nn.CrossEntropyLoss() + + def forward(self, graph, features): + h = F.relu(self.embedding_layer(features)) + for i in range(self.num_layers): + h = self.attention_layers[i](graph, h) + h = self.output_layer(h) + h = self.dropout(h) + return h + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, graph, feats): + return self.forward(graph, feats) diff --git a/hugegraph-ml/src/hugegraph_ml/models/appnp.py b/hugegraph-ml/src/hugegraph_ml/models/appnp.py new file mode 100644 index 00000000..04cb19d5 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/appnp.py @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Approximate personalized propagation of neural predictions (APPNP) + +References +---------- +Paper: https://arxiv.org/abs/1810.05997 +Author's code: https://github.com/klicperajo/ppnp +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/appnp +""" + +import torch.nn as nn + +from dgl.nn.pytorch.conv import APPNPConv + + +class APPNP(nn.Module): + def __init__( + self, + in_feats, + hiddens, + n_classes, + activation, + feat_drop, + edge_drop, + alpha, + k, + ): + super(APPNP, self).__init__() + self.layers = nn.ModuleList() + # input layer + self.layers.append(nn.Linear(in_feats, hiddens[0])) + # hidden layers + for i in range(1, len(hiddens)): + self.layers.append(nn.Linear(hiddens[i - 1], hiddens[i])) + # output layer + self.layers.append(nn.Linear(hiddens[-1], n_classes)) + self.activation = activation + if feat_drop: + self.feat_drop = nn.Dropout(feat_drop) + else: + self.feat_drop = lambda x: x + self.propagate = APPNPConv(k, alpha, edge_drop) + self.reset_parameters() + + self.criterion = nn.CrossEntropyLoss() + + def reset_parameters(self): + for layer in self.layers: + layer.reset_parameters() + + def forward(self, graph, features): + # prediction step + h = features + h = self.feat_drop(h) + h = self.activation(self.layers[0](h)) + for layer in self.layers[1:-1]: + h = self.activation(layer(h)) + h = self.layers[-1](self.feat_drop(h)) + # propagation step + h = self.propagate(graph, h) + return h + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, graph, feats): + return self.forward(graph, feats) diff --git a/hugegraph-ml/src/hugegraph_ml/models/arma.py b/hugegraph-ml/src/hugegraph_ml/models/arma.py new file mode 100644 index 00000000..7b07c684 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/arma.py @@ -0,0 +1,175 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
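Stepping back to appnp.py above: `APPNPConv` realizes the truncated personalized-PageRank recurrence Z^(k+1) = (1 - alpha) * A_hat * Z^(k) + alpha * H, where H is the MLP output. A dense toy sketch of the same recurrence (not the DGL sparse kernel; the adjacency here is randomly generated for illustration):

```python
import torch

def appnp_propagate(adj_hat, h, k=10, alpha=0.1):
    """Truncated PPR propagation: repeatedly diffuse, then teleport back to h."""
    z = h
    for _ in range(k):
        z = (1 - alpha) * adj_hat @ z + alpha * h
    return z

n = 5
a = (torch.rand(n, n) > 0.5).float()
a = torch.maximum(a, a.t())       # symmetrize the toy adjacency
a.fill_diagonal_(1.0)             # add self-loops
d = a.sum(dim=1).pow(-0.5)
adj_hat = d.unsqueeze(1) * a * d.unsqueeze(0)  # D^-1/2 (A + I) D^-1/2
print(appnp_propagate(adj_hat, torch.randn(n, 3)).shape)
```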
+ +""" +Auto-Regressive Moving Average (ARMA) + +References +---------- +Paper: https://arxiv.org/abs/1901.01343 +Author's code: +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/arma +""" + +import math + +import dgl.function as fn + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def glorot(tensor): + if tensor is not None: + stdv = math.sqrt(6.0 / (tensor.size(-2) + tensor.size(-1))) + tensor.data.uniform_(-stdv, stdv) + + +def zeros(tensor): + if tensor is not None: + tensor.data.fill_(0) + + +class ARMAConv(nn.Module): + def __init__( + self, + in_dim, + out_dim, + num_stacks, + num_layers, + activation=None, + dropout=0.0, + bias=True, + ): + super(ARMAConv, self).__init__() + + self.in_dim = in_dim + self.out_dim = out_dim + self.K = num_stacks + self.T = num_layers + self.activation = activation + self.dropout = nn.Dropout(p=dropout) + + # init weight + self.w_0 = nn.ModuleDict( + {str(k): nn.Linear(in_dim, out_dim, bias=False) for k in range(self.K)} + ) + # deeper weight + self.w = nn.ModuleDict( + {str(k): nn.Linear(out_dim, out_dim, bias=False) for k in range(self.K)} + ) + # skip-connection weight v + self.v = nn.ModuleDict( + {str(k): nn.Linear(in_dim, out_dim, bias=False) for k in range(self.K)} + ) + # bias + if bias: + self.bias = nn.Parameter(torch.Tensor(self.K, self.T, 1, self.out_dim)) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + for k in range(self.K): + glorot(self.w_0[str(k)].weight) + glorot(self.w[str(k)].weight) + glorot(self.v[str(k)].weight) + zeros(self.bias) + + def forward(self, g, feats): + with g.local_scope(): + init_feats = feats + # assume that the graphs are undirected and graph.in_degrees() is the same as graph.out_degrees() + degs = g.in_degrees().float().clamp(min=1) + norm = torch.pow(degs, -0.5).to(feats.device).unsqueeze(1) + output = [] + + for k in range(self.K): + feats = init_feats + for t in range(self.T): + feats = feats * norm + g.ndata["h"] = feats + g.update_all(fn.copy_u("h", "m"), fn.sum("m", "h")) + feats = g.ndata.pop("h") + feats = feats * norm + + if t == 0: + feats = self.w_0[str(k)](feats) + else: + feats = self.w[str(k)](feats) + + feats += self.dropout(self.v[str(k)](init_feats)) + + if self.bias is not None: + feats += self.bias[k][t] + + if self.activation is not None: + feats = self.activation(feats) + output.append(feats) + + return torch.stack(output).mean(dim=0) + + +class ARMA4NC(nn.Module): + def __init__( + self, + in_dim, + hid_dim, + out_dim, + num_stacks, + num_layers, + activation=None, + dropout=0.0, + ): + super(ARMA4NC, self).__init__() + + self.conv1 = ARMAConv( + in_dim=in_dim, + out_dim=hid_dim, + num_stacks=num_stacks, + num_layers=num_layers, + activation=activation, + dropout=dropout, + ) + + self.conv2 = ARMAConv( + in_dim=hid_dim, + out_dim=out_dim, + num_stacks=num_stacks, + num_layers=num_layers, + activation=activation, + dropout=dropout, + ) + + self.dropout = nn.Dropout(p=dropout) + + self.criterion = nn.CrossEntropyLoss() + + def forward(self, g, feats): + feats = F.relu(self.conv1(g, feats)) + feats = self.dropout(feats) + feats = self.conv2(g, feats) + return feats + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, graph, feats): + return self.forward(graph, feats) diff --git a/hugegraph-ml/src/hugegraph_ml/models/bgnn.py b/hugegraph-ml/src/hugegraph_ml/models/bgnn.py new file mode 100644 index
00000000..3db380db --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/bgnn.py @@ -0,0 +1,741 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Boost-GNN (BGNN) + +References +---------- +Paper: https://arxiv.org/abs/2101.08543 +Author's code: https://github.com/nd7141/bgnn +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/bgnn +""" + +import itertools +import time +from collections import defaultdict as ddict + +import numpy as np +import pandas as pd +import torch +import torch.nn.functional as F +from catboost import CatBoostClassifier, CatBoostRegressor, Pool, sum_models +from sklearn import preprocessing +from sklearn.metrics import r2_score +from tqdm import tqdm +from category_encoders import CatBoostEncoder +from dgl.nn.pytorch import ( + AGNNConv as AGNNConvDGL, + APPNPConv, + ChebConv as ChebConvDGL, + GATConv as GATConvDGL, + GraphConv, +) +from torch.nn import Dropout, ELU, Linear, ReLU, Sequential + + +class BGNNPredictor: + """ + Description + ----------- + Boost GNN predictor for semi-supervised node classification or regression problems. + Publication: https://arxiv.org/abs/2101.08543 + + Parameters + ---------- + gnn_model : nn.Module + DGL implementation of GNN model. + task: str, optional + Regression or classification task. + loss_fn : callable, optional + Function that takes torch tensors, pred and true, and returns a scalar. + trees_per_epoch : int, optional + Number of GBDT trees to build each epoch. + backprop_per_epoch : int, optional + Number of backpropagation steps to make each epoch. + lr : float, optional + Learning rate of gradient descent optimizer. + append_gbdt_pred : bool, optional + Append GBDT predictions or replace original input node features. + train_input_features : bool, optional + Train original input node features. + gbdt_depth : int, optional + Depth of each tree in GBDT model. + gbdt_lr : float, optional + Learning rate of GBDT model. + gbdt_alpha : int, optional + Weight to combine previous and new GBDT trees. + random_seed : int, optional + random seed for GNN and GBDT models. 
+ + Examples + ---------- + gnn_model = GAT(10, 20, num_heads=5), + bgnn = BGNNPredictor(gnn_model) + metrics = bgnn.fit(graph, X, y, train_mask, val_mask, test_mask, cat_features) + """ + + def __init__( + self, + gnn_model, + task="regression", + loss_fn=None, + trees_per_epoch=10, + backprop_per_epoch=10, + lr=0.01, + append_gbdt_pred=True, + train_input_features=False, + gbdt_depth=6, + gbdt_lr=0.1, + gbdt_alpha=1, + random_seed=0, + ): + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + self.model = gnn_model.to(self.device) + self.task = task + self.loss_fn = loss_fn + self.trees_per_epoch = trees_per_epoch + self.backprop_per_epoch = backprop_per_epoch + self.lr = lr + self.append_gbdt_pred = append_gbdt_pred + self.train_input_features = train_input_features + self.gbdt_depth = gbdt_depth + self.gbdt_lr = gbdt_lr + self.gbdt_alpha = gbdt_alpha + self.random_seed = random_seed + torch.manual_seed(random_seed) + np.random.seed(random_seed) + + def init_gbdt_model(self, num_epochs, epoch): + if self.task == "regression": + catboost_model_obj = CatBoostRegressor + catboost_loss_fn = "RMSE" + else: + if epoch == 0: # we predict multiclass probs at first epoch + catboost_model_obj = CatBoostClassifier + catboost_loss_fn = "MultiClass" + else: # we predict the gradients for each class at epochs > 0 + catboost_model_obj = CatBoostRegressor + catboost_loss_fn = "MultiRMSE" + + return catboost_model_obj( + iterations=num_epochs, + depth=self.gbdt_depth, + learning_rate=self.gbdt_lr, + loss_function=catboost_loss_fn, + random_seed=self.random_seed, + nan_mode="Min", + ) + + def fit_gbdt(self, pool, trees_per_epoch, epoch): + gbdt_model = self.init_gbdt_model(trees_per_epoch, epoch) + gbdt_model.fit(pool, verbose=False) + return gbdt_model + + def append_gbdt_model(self, new_gbdt_model, weights): + if self.gbdt_model is None: + return new_gbdt_model + return sum_models([self.gbdt_model, new_gbdt_model], weights=weights) + + def train_gbdt( + self, + gbdt_X_train, + gbdt_y_train, + cat_features, + epoch, + gbdt_trees_per_epoch, + gbdt_alpha, + ): + pool = Pool(gbdt_X_train, gbdt_y_train, cat_features=cat_features) + epoch_gbdt_model = self.fit_gbdt(pool, gbdt_trees_per_epoch, epoch) + if epoch == 0 and self.task == "classification": + self.base_gbdt = epoch_gbdt_model + else: + self.gbdt_model = self.append_gbdt_model( + epoch_gbdt_model, weights=[1, gbdt_alpha] + ) + + def update_node_features(self, node_features, X, original_X): + # get predictions from gbdt model + if self.task == "regression": + predictions = np.expand_dims(self.gbdt_model.predict(original_X), axis=1) + else: + predictions = self.base_gbdt.predict_proba(original_X) + if self.gbdt_model is not None: + predictions_after_one = self.gbdt_model.predict(original_X) + predictions += predictions_after_one + + # update node features with predictions + if self.append_gbdt_pred: + if self.train_input_features: + predictions = np.append( + node_features.detach().cpu().data[:, : -self.out_dim], + predictions, + axis=1, + ) # replace old predictions with new predictions + else: + predictions = np.append( + X, predictions, axis=1 + ) # append original features with new predictions + + predictions = torch.from_numpy(predictions).to(self.device) + + node_features.data = predictions.float().data + + def update_gbdt_targets(self, node_features, node_features_before, train_mask): + return ( + (node_features - node_features_before) + .detach() + .cpu() + .numpy()[train_mask, -self.out_dim :] + ) + + def 
init_node_features(self, X): + node_features = torch.empty( + X.shape[0], self.in_dim, requires_grad=True, device=self.device + ) + if self.append_gbdt_pred: + node_features.data[:, : -self.out_dim] = torch.from_numpy( + X.to_numpy(copy=True) + ) + return node_features + + def init_optimizer(self, node_features, optimize_node_features, learning_rate): + params = [self.model.parameters()] + if optimize_node_features: + params.append([node_features]) + optimizer = torch.optim.Adam(itertools.chain(*params), lr=learning_rate) + return optimizer + + def train_model(self, model_in, target_labels, train_mask, optimizer): + y = target_labels[train_mask] + + self.model.train() + logits = self.model(*model_in).squeeze() + pred = logits[train_mask] + + if self.loss_fn is not None: + loss = self.loss_fn(pred, y) + else: + if self.task == "regression": + loss = torch.sqrt(F.mse_loss(pred, y)) + elif self.task == "classification": + loss = F.cross_entropy(pred, y.long()) + else: + raise NotImplementedError( + "Unknown task. Supported tasks: classification, regression." + ) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + return loss + + def evaluate_model(self, logits, target_labels, mask): + metrics = {} + y = target_labels[mask] + with torch.no_grad(): + pred = logits[mask] + if self.task == "regression": + metrics["loss"] = torch.sqrt(F.mse_loss(pred, y).squeeze() + 1e-8) + metrics["rmsle"] = torch.sqrt( + F.mse_loss(torch.log(pred + 1), torch.log(y + 1)).squeeze() + 1e-8 + ) + metrics["mae"] = F.l1_loss(pred, y) + metrics["r2"] = torch.Tensor( + [r2_score(y.cpu().numpy(), pred.cpu().numpy())] + ) + elif self.task == "classification": + metrics["loss"] = F.cross_entropy(pred, y.long()) + metrics["accuracy"] = torch.Tensor( + [(y == pred.max(1)[1]).sum().item() / y.shape[0]] + ) + + return metrics + + def train_and_evaluate( + self, + model_in, + target_labels, + train_mask, + val_mask, + test_mask, + optimizer, + metrics, + gnn_passes_per_epoch, + ): + loss = None + + for _ in range(gnn_passes_per_epoch): + loss = self.train_model(model_in, target_labels, train_mask, optimizer) + + self.model.eval() + logits = self.model(*model_in).squeeze() + train_results = self.evaluate_model(logits, target_labels, train_mask) + val_results = self.evaluate_model(logits, target_labels, val_mask) + test_results = self.evaluate_model(logits, target_labels, test_mask) + for metric_name in train_results: + metrics[metric_name].append( + ( + train_results[metric_name].detach().item(), + val_results[metric_name].detach().item(), + test_results[metric_name].detach().item(), + ) + ) + return loss + + def update_early_stopping( + self, + metrics, + epoch, + best_metric, + best_val_epoch, + epochs_since_last_best_metric, + metric_name, + lower_better=False, + ): + train_metric, val_metric, test_metric = metrics[metric_name][-1] + if (lower_better and val_metric < best_metric[1]) or ( + not lower_better and val_metric > best_metric[1] + ): + best_metric = metrics[metric_name][-1] + best_val_epoch = epoch + epochs_since_last_best_metric = 0 + else: + epochs_since_last_best_metric += 1 + return best_metric, best_val_epoch, epochs_since_last_best_metric + + def log_epoch( + self, + pbar, + metrics, + epoch, + loss, + epoch_time, + logging_epochs, + metric_name="loss", + ): + train_metric, val_metric, test_metric = metrics[metric_name][-1] + if epoch and epoch % logging_epochs == 0: + pbar.set_description( + "Epoch {:05d} | Loss {:.3f} | {} {:.3f}/{:.3f}/{:.3f} | Time {:.4f}".format( + epoch, + loss, + metric_name, + train_metric,
+ val_metric, + test_metric, + epoch_time, + ) + ) + + def fit( + self, + graph, + X, + y, + train_mask, + val_mask, + test_mask, + original_X=None, + cat_features=None, + num_epochs=100, + patience=10, + logging_epochs=1, + metric_name="loss", + ): + """ + + :param graph : dgl.DGLGraph + Input graph. + :param X : pd.DataFrame + Input node features. Each column represents one input feature. Each row is a node. + Values in dataframe are numerical, after preprocessing. + :param y : pd.DataFrame + Input node targets. Each column represents one target. Each row is a node + (order of nodes should be the same as in X). + :param train_mask : list[int] + Node indexes (rows) that belong to train set. + :param val_mask : list[int] + Node indexes (rows) that belong to validation set. + :param test_mask : list[int] + Node indexes (rows) that belong to test set. + :param original_X : pd.DataFrame, optional + Input node features before preprocessing. Each column represents one input feature. Each row is a node. + Values in dataframe can be of any type, including categorical (e.g. string, bool) or + missing values (None). This is useful if you want to preprocess X with GBDT model. + :param cat_features: list[int] + Feature indexes (columns) which are categorical features. + :param num_epochs : int + Number of epochs to run. + :param patience : int + Number of epochs to wait until early stopping. + :param logging_epochs : int + Log every n epochs. + :param metric_name : str + Metric to use for early stopping. + :return: metrics evaluated during training + """ + + # initialize for early stopping and metrics + if metric_name in ["r2", "accuracy"]: + best_metric = [float("-inf")] * 3 # for train/val/test + else: + best_metric = [float("inf")] * 3 # for train/val/test + + best_val_epoch = 0 + epochs_since_last_best_metric = 0 + metrics = ddict(list) + if cat_features is None: + cat_features = [] + + if self.task == "regression": + self.out_dim = y.shape[1] + elif self.task == "classification": + self.out_dim = len(set(y.iloc[test_mask, 0])) + self.in_dim = ( + self.out_dim + X.shape[1] if self.append_gbdt_pred else self.out_dim + ) + + if original_X is None: + original_X = X.copy() + cat_features = [] + + gbdt_X_train = original_X.iloc[train_mask] + gbdt_y_train = y.iloc[train_mask] + gbdt_alpha = self.gbdt_alpha + self.gbdt_model = None + + node_features = self.init_node_features(X) + optimizer = self.init_optimizer( + node_features, optimize_node_features=True, learning_rate=self.lr + ) + + y = torch.from_numpy(y.to_numpy(copy=True)).float().squeeze().to(self.device) + graph = graph.to(self.device) + + pbar = tqdm(range(num_epochs)) + for epoch in pbar: + start2epoch = time.time() + + # gbdt part + self.train_gbdt( + gbdt_X_train, + gbdt_y_train, + cat_features, + epoch, + self.trees_per_epoch, + gbdt_alpha, + ) + + self.update_node_features(node_features, X, original_X) + node_features_before = node_features.clone() + model_in = (graph, node_features) + loss = self.train_and_evaluate( + model_in, + y, + train_mask, + val_mask, + test_mask, + optimizer, + metrics, + self.backprop_per_epoch, + ) + gbdt_y_train = self.update_gbdt_targets( + node_features, node_features_before, train_mask + ) + + self.log_epoch( + pbar, + metrics, + epoch, + loss, + time.time() - start2epoch, + logging_epochs, + metric_name=metric_name, + ) + + # check early
stopping + ( + best_metric, + best_val_epoch, + epochs_since_last_best_metric, + ) = self.update_early_stopping( + metrics, + epoch, + best_metric, + best_val_epoch, + epochs_since_last_best_metric, + metric_name, + lower_better=(metric_name not in ["r2", "accuracy"]), + ) + if patience and epochs_since_last_best_metric > patience: + break + + if np.isclose(gbdt_y_train.sum(), 0.0): + print("Node embeddings do not change anymore. Stopping...") + break + + print( + "Best {} at iteration {}: {:.3f}/{:.3f}/{:.3f}".format( + metric_name, best_val_epoch, *best_metric + ) + ) + return metrics + + def predict(self, graph, X, test_mask): + graph = graph.to(self.device) + node_features = torch.empty(X.shape[0], self.in_dim).to(self.device) + self.update_node_features(node_features, X, X) + logits = self.model(graph, node_features).squeeze() + if self.task == "regression": + return logits[test_mask] + else: + return logits[test_mask].max(1)[1] + + def plot_interactive( + self, + metrics, + legend, + title, + logx=False, + logy=False, + metric_name="loss", + start_from=0, + ): + import plotly.graph_objects as go + + metric_results = metrics[metric_name] + xs = [list(range(len(metric_results)))] * len(metric_results[0]) + ys = list(zip(*metric_results)) + + fig = go.Figure() + for i in range(len(ys)): + fig.add_trace( + go.Scatter( + x=xs[i][start_from:], + y=ys[i][start_from:], + mode="lines+markers", + name=legend[i], + ) + ) + + fig.update_layout( + title=title, + title_x=0.5, + xaxis_title="Epoch", + yaxis_title=metric_name, + font=dict( + size=40, + ), + height=600, + ) + + if logx: + fig.update_layout(xaxis_type="log") + if logy: + fig.update_layout(yaxis_type="log") + + fig.show() + + +class GNNModelDGL(torch.nn.Module): + def __init__( + self, + in_dim, + hidden_dim, + out_dim, + dropout=0.0, + name="gat", + residual=True, + use_mlp=False, + join_with_mlp=False, + ): + super(GNNModelDGL, self).__init__() + self.name = name + self.use_mlp = use_mlp + self.join_with_mlp = join_with_mlp + self.normalize_input_columns = True + if name == "gat": + self.l1 = GATConvDGL( + in_dim, + hidden_dim // 8, + 8, + feat_drop=dropout, + attn_drop=dropout, + residual=False, + activation=F.elu, + ) + self.l2 = GATConvDGL( + hidden_dim, + out_dim, + 1, + feat_drop=dropout, + attn_drop=dropout, + residual=residual, + activation=None, + ) + elif name == "gcn": + self.l1 = GraphConv(in_dim, hidden_dim, activation=F.elu) + self.l2 = GraphConv(hidden_dim, out_dim, activation=F.elu) + self.drop = Dropout(p=dropout) + elif name == "cheb": + self.l1 = ChebConvDGL(in_dim, hidden_dim, k=3) + self.l2 = ChebConvDGL(hidden_dim, out_dim, k=3) + self.drop = Dropout(p=dropout) + elif name == "agnn": + self.lin1 = Sequential( + Dropout(p=dropout), Linear(in_dim, hidden_dim), ELU() + ) + self.l1 = AGNNConvDGL(learn_beta=False) + self.l2 = AGNNConvDGL(learn_beta=True) + self.lin2 = Sequential( + Dropout(p=dropout), Linear(hidden_dim, out_dim), ELU() + ) + elif name == "appnp": + self.lin1 = Sequential( + Dropout(p=dropout), + Linear(in_dim, hidden_dim), + ReLU(), + Dropout(p=dropout), + Linear(hidden_dim, out_dim), + ) + self.l1 = APPNPConv(k=10, alpha=0.1, edge_drop=0.0) + + def forward(self, graph, features): + h = features + if self.use_mlp: + if self.join_with_mlp: + h = torch.cat((h, self.mlp(features)), 1) + else: + h = self.mlp(features) + if self.name == "gat": + h = self.l1(graph, h).flatten(1) + logits = self.l2(graph, h).mean(1) + elif self.name in ["appnp"]: + h = self.lin1(h) + logits = self.l1(graph, h) + elif 
self.name == "agnn": + h = self.lin1(h) + h = self.l1(graph, h) + h = self.l2(graph, h) + logits = self.lin2(h) + elif self.name == "cheb": + lambda_max = dgl.laplacian_lambda_max(graph) + h = self.drop(h) + h = self.l1(graph, h, lambda_max) + logits = self.l2(graph, h, lambda_max) + elif self.name == "gcn": + h = self.drop(h) + h = self.l1(graph, h) + logits = self.l2(graph, h) + + return logits + + +def read_input(input_folder): + X = pd.read_csv(f"{input_folder}/X.csv") + y = pd.read_csv(f"{input_folder}/y.csv") + + categorical_columns = [] + if os.path.exists(f"{input_folder}/cat_features.txt"): + with open(f"{input_folder}/cat_features.txt") as f: + for line in f: + if line.strip(): + categorical_columns.append(line.strip()) + + cat_features = None + if categorical_columns: + columns = X.columns + cat_features = np.where(columns.isin(categorical_columns))[0] + + for col in list(columns[cat_features]): + X[col] = X[col].astype(str) + + gs, _ = load_graphs(f"{input_folder}/graph.dgl") + graph = gs[0] + + with open(f"{input_folder}/masks.json") as f: + masks = json.load(f) + + return graph, X, y, cat_features, masks + + +def normalize_features(X, train_mask, val_mask, test_mask): + min_max_scaler = preprocessing.MinMaxScaler() + A = X.to_numpy(copy=True) + A[train_mask] = min_max_scaler.fit_transform(A[train_mask]) + A[val_mask + test_mask] = min_max_scaler.transform(A[val_mask + test_mask]) + return pd.DataFrame(A, columns=X.columns).astype(float) + + +def replace_na(X, train_mask): + if X.isna().any().any(): + return X.fillna(X.iloc[train_mask].min() - 1) + return X + + +def encode_cat_features(X, y, cat_features, train_mask, val_mask, test_mask): + enc = CatBoostEncoder() + A = X.to_numpy(copy=True) + b = y.to_numpy(copy=True) + A[np.ix_(train_mask, cat_features)] = enc.fit_transform( + A[np.ix_(train_mask, cat_features)], b[train_mask] + ) + A[np.ix_(val_mask + test_mask, cat_features)] = enc.transform( + A[np.ix_(val_mask + test_mask, cat_features)] + ) + A = A.astype(float) + return pd.DataFrame(A, columns=X.columns) + + +def convert_data(g): + retrieved_tensor = g.ndata["feat"] + retrieved_np = retrieved_tensor.numpy() + retrieved_str = retrieved_np.astype(str) + X = pd.DataFrame(retrieved_str) + + retrieved_y_tensor = g.ndata["class"] + retrieved_y_np = retrieved_y_tensor.numpy() + y = pd.DataFrame(retrieved_y_np) + + retrieved_cat_features_tensor = g.ndata["cat_features"][0] + cat_features = retrieved_cat_features_tensor.numpy() + + train_mask = g.ndata["train_mask"].numpy().tolist() + val_mask = g.ndata["val_mask"].numpy().tolist() + test_mask = g.ndata["test_mask"].numpy().tolist() + masks = { + "0": { + "train": [i for i, v in enumerate(train_mask) if v == 1], + "val": [i for i, v in enumerate(val_mask) if v == 1], + "test": [i for i, v in enumerate(test_mask) if v == 1], + } + } + + # graph, X, y, cat_features, masks = read_input(input_folder) + train_mask, val_mask, test_mask = ( + masks["0"]["train"], + masks["0"]["val"], + masks["0"]["test"], + ) + + return X, y, cat_features, train_mask, val_mask, test_mask diff --git a/hugegraph-ml/src/hugegraph_ml/models/bgrl.py b/hugegraph-ml/src/hugegraph_ml/models/bgrl.py new file mode 100644 index 00000000..6d991ee2 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/bgrl.py @@ -0,0 +1,260 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Bootstrapped Graph Latents (BGRL) + +References +---------- +Paper: https://arxiv.org/abs/2102.06514 +Author's code: https://github.com/nerdslab/bgrl +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/bgrl +""" + +import copy + +import dgl + +import numpy as np +import torch +from dgl.nn.pytorch.conv import GraphConv, SAGEConv +from torch import nn +from torch.nn import BatchNorm1d, Parameter +from torch.nn.init import ones_, zeros_ +from dgl.transforms import Compose, DropEdge, FeatMask +from torch.nn.functional import cosine_similarity + +class MLP_Predictor(nn.Module): + r"""MLP used for predictor. The MLP has one hidden layer. + Args: + input_size (int): Size of input features. + output_size (int): Size of output features. + hidden_size (int, optional): Size of hidden layer. (default: :obj:`512`). + """ + + def __init__(self, input_size, output_size, hidden_size=512): + super().__init__() + + self.net = nn.Sequential( + nn.Linear(input_size, hidden_size, bias=True), + nn.PReLU(1), + nn.Linear(hidden_size, output_size, bias=True), + ) + self.reset_parameters() + + def forward(self, x): + return self.net(x) + + def reset_parameters(self): + # kaiming_uniform + for m in self.modules(): + if isinstance(m, nn.Linear): + m.reset_parameters() + + +class GCN(nn.Module): + def __init__(self, layer_sizes, batch_norm_mm=0.99): + super(GCN, self).__init__() + + self.layers = nn.ModuleList() + for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]): + self.layers.append(GraphConv(in_dim, out_dim)) + self.layers.append(BatchNorm1d(out_dim, momentum=batch_norm_mm)) + self.layers.append(nn.PReLU()) + + def forward(self, g, feats): + x = feats + for layer in self.layers: + if isinstance(layer, GraphConv): + x = layer(g, x) + else: + x = layer(x) + return x + + def reset_parameters(self): + for layer in self.layers: + if hasattr(layer, "reset_parameters"): + layer.reset_parameters() + +class BGRL(nn.Module): + r"""BGRL architecture for Graph representation learning. + Args: + encoder (torch.nn.Module): Encoder network to be duplicated and used in both online and target networks. + predictor (torch.nn.Module): Predictor network used to predict the target projection from the online projection. + .. note:: + `encoder` must have a `reset_parameters` method, as the weights of the target network will be initialized + differently from the online network.
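+ + Example: + A minimal usage sketch (layer sizes and the `graph`/`feat_dim` names are illustrative): + >>> encoder = GCN([feat_dim, 256, 256]) + >>> predictor = MLP_Predictor(256, 256, hidden_size=512) + >>> model = BGRL(encoder, predictor) + >>> loss = model(graph, graph.ndata["feat"]) + >>> model.update_target_network(mm=0.99)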
+ """ + + def __init__(self, encoder, predictor): + super(BGRL, self).__init__() + # online network + self.online_encoder = encoder + self.predictor = predictor + + # target network + self.target_encoder = copy.deepcopy(encoder) + + # reinitialize weights + self.target_encoder.reset_parameters() + + # stop gradient + for param in self.target_encoder.parameters(): + param.requires_grad = False + + def trainable_parameters(self): + r"""Returns the parameters that will be updated via an optimizer.""" + return list(self.online_encoder.parameters()) + list( + self.predictor.parameters() + ) + + @torch.no_grad() + def update_target_network(self, mm): + r"""Performs a momentum update of the target network's weights. + Args: + mm (float): Momentum used in moving average update. + """ + for param_q, param_k in zip( + self.online_encoder.parameters(), self.target_encoder.parameters() + ): + param_k.data.mul_(mm).add_(param_q.data, alpha=1.0 - mm) + + def forward(self, graph, feat): + transform_1 = get_graph_drop_transform( + drop_edge_p=0.3, feat_mask_p=0.3 + ) + transform_2 = get_graph_drop_transform( + drop_edge_p=0.2, feat_mask_p=0.4 + ) + online_x = transform_1(graph) + target_x = transform_2(graph) + online_x, target_x = dgl.add_self_loop(online_x), dgl.add_self_loop(target_x) + online_feats, target_feats = online_x.ndata["feat"], target_x.ndata["feat"] + + # forward online network + online_y1 = self.online_encoder(online_x, online_feats) + # prediction + online_q1 = self.predictor(online_y1) + # forward target network + with torch.no_grad(): + target_y1 = self.target_encoder(target_x, target_feats).detach() + + # forward online network 2 + online_y2 = self.online_encoder(target_x, target_feats) + # prediction + online_q2 = self.predictor(online_y2) + # forward target network + with torch.no_grad(): + target_y2 = self.target_encoder(online_x, online_feats).detach() + + loss = ( + 2 + - cosine_similarity(online_q1, target_y1.detach(), dim=-1).mean() + - cosine_similarity(online_q2, target_y2.detach(), dim=-1).mean() + ) + return loss + + def get_embedding(self, graph, feats): + """ + Get the node embeddings from the encoder without computing gradients. + + Parameters + ---------- + graph : dgl.DGLGraph + The input graph. + feats : torch.Tensor + Node features. + + Returns + ------- + torch.Tensor + Node embeddings. + """ + h = self.target_encoder(graph, feats) # Encode the node features with GCN + return h.detach() # Detach from computation graph for evaluation + +def compute_representations(net, dataset, device): + r"""Pre-computes the representations for the entire data. + Returns: + [torch.Tensor, torch.Tensor]: Representations and labels. 
+ """ + net.eval() + reps = [] + labels = [] + + if len(dataset) == 1: + g = dataset[0] + g = dgl.add_self_loop(g) + g = g.to(device) + with torch.no_grad(): + reps.append(net(g)) + labels.append(g.ndata["label"]) + else: + for g in dataset: + # forward + g = g.to(device) + with torch.no_grad(): + reps.append(net(g)) + labels.append(g.ndata["label"]) + + reps = torch.cat(reps, dim=0) + labels = torch.cat(labels, dim=0) + return [reps, labels] + +class CosineDecayScheduler: + def __init__(self, max_val, warmup_steps, total_steps): + self.max_val = max_val + self.warmup_steps = warmup_steps + self.total_steps = total_steps + + def get(self, step): + if step < self.warmup_steps: + return self.max_val * step / self.warmup_steps + elif self.warmup_steps <= step <= self.total_steps: + return ( + self.max_val + * ( + 1 + + np.cos( + (step - self.warmup_steps) + * np.pi + / (self.total_steps - self.warmup_steps) + ) + ) + / 2 + ) + else: + raise ValueError( + "Step ({}) > total number of steps ({}).".format( + step, self.total_steps + ) + ) + +def get_graph_drop_transform(drop_edge_p, feat_mask_p): + transforms = list() + + # make copy of graph + transforms.append(copy.deepcopy) + + # drop edges + if drop_edge_p > 0.0: + transforms.append(DropEdge(drop_edge_p)) + + # drop features + if feat_mask_p > 0.0: + transforms.append(FeatMask(feat_mask_p, node_feat_names=["feat"])) + + return Compose(transforms) \ No newline at end of file diff --git a/hugegraph-ml/src/hugegraph_ml/models/care_gnn.py b/hugegraph-ml/src/hugegraph_ml/models/care_gnn.py new file mode 100644 index 00000000..7046569a --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/care_gnn.py @@ -0,0 +1,232 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +CAmouflage-REsistant GNN (CARE-GNN) + +References +---------- +Paper: https://arxiv.org/abs/2008.08692 +Author's code: https://github.com/YingtongDou/CARE-GNN +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/caregnn +""" + +import dgl.function as fn +import numpy as np +import torch as th +import torch.nn as nn + + +class CAREConv(nn.Module): + """One layer of CARE-GNN.""" + + def __init__( + self, + in_dim, + out_dim, + num_classes, + edges, + activation=None, + step_size=0.02, + ): + super(CAREConv, self).__init__() + + self.activation = activation + self.step_size = step_size + self.in_dim = in_dim + self.out_dim = out_dim + self.num_classes = num_classes + self.edges = edges + self.dist = {} + + self.linear = nn.Linear(self.in_dim, self.out_dim) + self.MLP = nn.Linear(self.in_dim, self.num_classes) + + self.p = {} + self.last_avg_dist = {} + self.f = {} + self.cvg = {} + for etype in edges: + self.p[etype] = 0.5 + self.last_avg_dist[etype] = 0 + self.f[etype] = [] + self.cvg[etype] = False + + def _calc_distance(self, edges): + # formula 2 + d = th.norm( + th.tanh(self.MLP(edges.src["h"])) - th.tanh(self.MLP(edges.dst["h"])), + 1, + 1, + ) + return {"d": d} + + def _top_p_sampling(self, g, p): + # this implementation is low efficient + # optimization requires dgl.sampling.select_top_p requested in issue #3100 + dist = g.edata["d"] + neigh_list = [] + for node in g.nodes(): + edges = g.in_edges(node, form="eid") + num_neigh = th.ceil(g.in_degrees(node) * p).int().item() + neigh_dist = dist[edges] + if neigh_dist.shape[0] > num_neigh: + neigh_index = np.argpartition(neigh_dist.cpu().detach(), num_neigh)[ + :num_neigh + ] + else: + neigh_index = np.arange(num_neigh) + neigh_list.append(edges[neigh_index]) + return th.cat(neigh_list) + + def forward(self, g, feat): + with g.local_scope(): + g.ndata["h"] = feat + + hr = {} + for i, etype in enumerate(g.canonical_etypes): + g.apply_edges(self._calc_distance, etype=etype) + self.dist[etype] = g.edges[etype].data["d"] + sampled_edges = self._top_p_sampling(g[etype], self.p[etype]) + + # formula 8 + g.send_and_recv( + sampled_edges, + fn.copy_u("h", "m"), + fn.mean("m", "h_%s" % etype[1]), + etype=etype, + ) + hr[etype] = g.ndata["h_%s" % etype[1]] + if self.activation is not None: + hr[etype] = self.activation(hr[etype]) + + # formula 9 using mean as inter-relation aggregator + p_tensor = th.Tensor(list(self.p.values())).view(-1, 1, 1).to(g.device) + h_homo = th.sum(th.stack(list(hr.values())) * p_tensor, dim=0) + h_homo += feat + if self.activation is not None: + h_homo = self.activation(h_homo) + + return self.linear(h_homo) + + +class CAREGNN(nn.Module): + def __init__( + self, + in_dim, + num_classes, + hid_dim=64, + edges=None, + num_layers=2, + activation=None, + step_size=0.02, + ): + super(CAREGNN, self).__init__() + self.in_dim = in_dim + self.hid_dim = hid_dim + self.num_classes = num_classes + self.edges = edges + self.activation = activation + self.step_size = step_size + self.num_layers = num_layers + + self.layers = nn.ModuleList() + + if self.num_layers == 1: + # Single layer + self.layers.append( + CAREConv( + self.in_dim, + self.num_classes, + self.num_classes, + self.edges, + activation=self.activation, + step_size=self.step_size, + ) + ) + + else: + # Input layer + self.layers.append( + CAREConv( + self.in_dim, + self.hid_dim, + self.num_classes, + self.edges, + activation=self.activation, + step_size=self.step_size, + ) + ) + + # Hidden layers with n - 2 layers + for i in range(self.num_layers - 
2): + self.layers.append( + CAREConv( + self.hid_dim, + self.hid_dim, + self.num_classes, + self.edges, + activation=self.activation, + step_size=self.step_size, + ) + ) + + # Output layer + self.layers.append( + CAREConv( + self.hid_dim, + self.num_classes, + self.num_classes, + self.edges, + activation=self.activation, + step_size=self.step_size, + ) + ) + + def forward(self, graph, feat): + # For full graph training, directly use the graph + # formula 4 + sim = th.tanh(self.layers[0].MLP(feat)) + + # Forward of n layers of CARE-GNN + for layer in self.layers: + feat = layer(graph, feat) + + return feat, sim + + def RLModule(self, graph, epoch, idx): + for layer in self.layers: + for etype in self.edges: + if not layer.cvg[etype]: + # formula 5 + eid = graph.in_edges(idx, form="eid", etype=etype) + avg_dist = th.mean(layer.dist[etype][eid]) + + # formula 6 + if layer.last_avg_dist[etype] < avg_dist: + if layer.p[etype] - self.step_size > 0: + layer.p[etype] -= self.step_size + layer.f[etype].append(-1) + else: + if layer.p[etype] + self.step_size <= 1: + layer.p[etype] += self.step_size + layer.f[etype].append(+1) + layer.last_avg_dist[etype] = avg_dist + + # formula 7 + if epoch >= 9 and abs(sum(layer.f[etype][-10:])) <= 2: + layer.cvg[etype] = True diff --git a/hugegraph-ml/src/hugegraph_ml/models/cluster_gcn.py b/hugegraph-ml/src/hugegraph_ml/models/cluster_gcn.py new file mode 100644 index 00000000..ce4fb944 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/cluster_gcn.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +Cluster-GCN + +References +---------- +Paper: https://arxiv.org/abs/1905.07953 +Author's code: https://github.com/google-research/google-research/tree/master/cluster_gcn +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/cluster_gcn +""" + +import dgl +import dgl.nn as dglnn +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchmetrics.functional as MF + + +class SAGE(nn.Module): + def __init__(self, in_feats, n_hidden, n_classes): + super().__init__() + self.layers = nn.ModuleList() + self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, "mean")) + self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, "mean")) + self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, "mean")) + self.dropout = nn.Dropout(0.5) + + def forward(self, sg, x): + h = x + for l, layer in enumerate(self.layers): + h = layer(sg, h) + if l != len(self.layers) - 1: + h = F.relu(h) + h = self.dropout(h) + return h + + def loss(self, logits, labels): + return nn.CrossEntropyLoss()(logits, labels) + + def inference(self, sg, x): + return self.forward(sg, x) diff --git a/hugegraph-ml/src/hugegraph_ml/models/correct_and_smooth.py b/hugegraph-ml/src/hugegraph_ml/models/correct_and_smooth.py new file mode 100644 index 00000000..34f8ec75 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/correct_and_smooth.py @@ -0,0 +1,262 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" + Correct and Smooth (C&S) + +References +---------- +Paper: https://arxiv.org/abs/2010.13993 +Author's code: https://github.com/CUAI/CorrectAndSmooth +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/correct_and_smooth +""" + +import dgl.function as fn +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class MLPLinear(nn.Module): + def __init__(self, in_dim, out_dim): + super(MLPLinear, self).__init__() + self.linear = nn.Linear(in_dim, out_dim) + self.reset_parameters() + self.criterion = nn.CrossEntropyLoss() + + def reset_parameters(self): + self.linear.reset_parameters() + + def forward(self, graph, x): + return F.log_softmax(self.linear(x), dim=-1) + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, graph, feats): + return self.forward(graph, feats) + + +class MLP(nn.Module): + def __init__(self, in_dim, hid_dim, out_dim, num_layers, dropout=0.0): + super(MLP, self).__init__() + assert num_layers >= 2 + + self.linears = nn.ModuleList() + self.bns = nn.ModuleList() + self.linears.append(nn.Linear(in_dim, hid_dim)) + self.bns.append(nn.BatchNorm1d(hid_dim)) + + for _ in range(num_layers - 2): + self.linears.append(nn.Linear(hid_dim, hid_dim)) + self.bns.append(nn.BatchNorm1d(hid_dim)) + + self.linears.append(nn.Linear(hid_dim, out_dim)) + self.dropout = dropout + self.reset_parameters() + + self.criterion = nn.CrossEntropyLoss() + + def reset_parameters(self): + for layer in self.linears: + layer.reset_parameters() + for layer in self.bns: + layer.reset_parameters() + + def forward(self, graph, x): + for linear, bn in zip(self.linears[:-1], self.bns): + x = linear(x) + x = F.relu(x, inplace=True) + x = bn(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = self.linears[-1](x) + return F.log_softmax(x, dim=-1) + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, graph, feats): + return self.forward(graph, feats) + + +class LabelPropagation(nn.Module): + r""" + + Description + ----------- + Introduced in `Learning from Labeled and Unlabeled Data with Label Propagation `_ + + .. math:: + \mathbf{Y}^{\prime} = \alpha \cdot \mathbf{D}^{-1/2} \mathbf{A} + \mathbf{D}^{-1/2} \mathbf{Y} + (1 - \alpha) \mathbf{Y}, + + where unlabeled data is inferred by labeled data via propagation. + + Parameters + ---------- + num_layers: int + The number of propagations. + alpha: float + The :math:`\alpha` coefficient. 
+ adj: str + 'DAD': D^-0.5 * A * D^-0.5 + 'DA': D^-1 * A + 'AD': A * D^-1 + """ + + def __init__(self, num_layers, alpha, adj="DAD"): + super(LabelPropagation, self).__init__() + + self.num_layers = num_layers + self.alpha = alpha + self.adj = adj + + @torch.no_grad() + def forward(self, g, labels, mask=None, post_step=lambda y: y.clamp_(0.0, 1.0)): + with g.local_scope(): + if labels.dtype == torch.long: + labels = F.one_hot(labels.view(-1)).to(torch.float32) + + y = labels + if mask is not None: + y = torch.zeros_like(labels) + y[mask] = labels[mask] + + last = (1 - self.alpha) * y + degs = g.in_degrees().float().clamp(min=1) + norm = ( + torch.pow(degs, -0.5 if self.adj == "DAD" else -1) + .to(labels.device) + .unsqueeze(1) + ) + + for _ in range(self.num_layers): + # Assume the graphs to be undirected + if self.adj in ["DAD", "AD"]: + y = norm * y + + g.ndata["h"] = y + g.update_all(fn.copy_u("h", "m"), fn.sum("m", "h")) + y = self.alpha * g.ndata.pop("h") + + if self.adj in ["DAD", "DA"]: + y = y * norm + + y = post_step(last + y) + + return y + + +class CorrectAndSmooth(nn.Module): + r""" + + Description + ----------- + Introduced in `Combining Label Propagation and Simple Models Out-performs Graph Neural Networks <https://arxiv.org/abs/2010.13993>`_ + + Parameters + ---------- + num_correction_layers: int + The number of correction propagations. + correction_alpha: float + The coefficient of correction. + correction_adj: str + 'DAD': D^-0.5 * A * D^-0.5 + 'DA': D^-1 * A + 'AD': A * D^-1 + num_smoothing_layers: int + The number of smoothing propagations. + smoothing_alpha: float + The coefficient of smoothing. + smoothing_adj: str + 'DAD': D^-0.5 * A * D^-0.5 + 'DA': D^-1 * A + 'AD': A * D^-1 + autoscale: bool, optional + If set to True, will automatically determine the scaling factor :math:`\sigma`. Default is True. + scale: float, optional + The scaling factor :math:`\sigma`, in case :obj:`autoscale = False`. Default is 1.
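+ + Example + ------- + A minimal sketch (propagation depths and alpha values are illustrative; `y_soft` holds the + base predictor's class probabilities, e.g. the exponential of the MLP's log-softmax output): + >>> cs = CorrectAndSmooth(50, 0.979, "DAD", 50, 0.756, "DAD") + >>> y_soft = cs.correct(g, y_soft, labels[train_mask], train_mask) + >>> y_soft = cs.smooth(g, y_soft, labels[train_mask], train_mask)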
+ """ + + def __init__( + self, + num_correction_layers, + correction_alpha, + correction_adj, + num_smoothing_layers, + smoothing_alpha, + smoothing_adj, + autoscale=True, + scale=1.0, + ): + super(CorrectAndSmooth, self).__init__() + + self.autoscale = autoscale + self.scale = scale + + self.prop1 = LabelPropagation( + num_correction_layers, correction_alpha, correction_adj + ) + self.prop2 = LabelPropagation( + num_smoothing_layers, smoothing_alpha, smoothing_adj + ) + + def correct(self, g, y_soft, y_true, mask): + with g.local_scope(): + assert abs(float(y_soft.sum()) / y_soft.size(0) - 1.0) < 1e-2 + numel = int(mask.sum()) if mask.dtype == torch.bool else mask.size(0) + assert y_true.size(0) == numel + + if y_true.dtype == torch.long: + y_true = F.one_hot(y_true.view(-1), y_soft.size(-1)).to(y_soft.dtype) + + error = torch.zeros_like(y_soft) + error[mask] = y_true - y_soft[mask] + + if self.autoscale: + smoothed_error = self.prop1( + g, error, post_step=lambda x: x.clamp_(-1.0, 1.0) + ) + sigma = error[mask].abs().sum() / numel + scale = sigma / smoothed_error.abs().sum(dim=1, keepdim=True) + scale[scale.isinf() | (scale > 1000)] = 1.0 + + result = y_soft + scale * smoothed_error + result[result.isnan()] = y_soft[result.isnan()] + return result + else: + + def fix_input(x): + x[mask] = error[mask] + return x + + smoothed_error = self.prop1(g, error, post_step=fix_input) + + result = y_soft + self.scale * smoothed_error + result[result.isnan()] = y_soft[result.isnan()] + return result + + def smooth(self, g, y_soft, y_true, mask): + with g.local_scope(): + numel = int(mask.sum()) if mask.dtype == torch.bool else mask.size(0) + assert y_true.size(0) == numel + + if y_true.dtype == torch.long: + y_true = F.one_hot(y_true.view(-1), y_soft.size(-1)).to(y_soft.dtype) + + y_soft[mask] = y_true + return self.prop2(g, y_soft) diff --git a/hugegraph-ml/src/hugegraph_ml/models/dagnn.py b/hugegraph-ml/src/hugegraph_ml/models/dagnn.py new file mode 100644 index 00000000..2e9d260f --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/dagnn.py @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +Deep Adaptive Graph Neural Network (DAGNN) + +References +---------- +Paper: https://arxiv.org/abs/2007.09296 +Author's code: https://github.com/divelab/DeeperGNN +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/dagnn +""" + +import dgl.function as fn + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F, Parameter +import random + + +class DAGNNConv(nn.Module): + def __init__(self, in_dim, k): + super(DAGNNConv, self).__init__() + + self.s = Parameter(torch.FloatTensor(in_dim, 1)) + self.k = k + + self.reset_parameters() + + def reset_parameters(self): + gain = nn.init.calculate_gain("sigmoid") + nn.init.xavier_uniform_(self.s, gain=gain) + + def forward(self, graph, feats): + with graph.local_scope(): + results = [feats] + + degs = graph.in_degrees().float() + norm = torch.pow(degs, -0.5) + norm = norm.to(feats.device).unsqueeze(1) + + for _ in range(self.k): + feats = feats * norm + graph.ndata["h"] = feats + graph.update_all(fn.copy_u("h", "m"), fn.sum("m", "h")) + feats = graph.ndata["h"] + feats = feats * norm + results.append(feats) + + H = torch.stack(results, dim=1) + S = F.sigmoid(torch.matmul(H, self.s)) + S = S.permute(0, 2, 1) + H = torch.matmul(S, H).squeeze() + + return H + + +class MLPLayer(nn.Module): + def __init__(self, in_dim, out_dim, bias=True, activation=None, dropout=0): + super(MLPLayer, self).__init__() + + self.linear = nn.Linear(in_dim, out_dim, bias=bias) + self.activation = activation + self.dropout = nn.Dropout(dropout) + self.reset_parameters() + + def reset_parameters(self): + gain = 1.0 + if self.activation is F.relu: + gain = nn.init.calculate_gain("relu") + nn.init.xavier_uniform_(self.linear.weight, gain=gain) + if self.linear.bias is not None: + nn.init.zeros_(self.linear.bias) + + def forward(self, feats): + feats = self.dropout(feats) + feats = self.linear(feats) + if self.activation: + feats = self.activation(feats) + + return feats + + +class DAGNN(nn.Module): + def __init__( + self, + k, + in_dim, + hid_dim, + out_dim, + bias=True, + activation=F.relu, + dropout=0, + ): + super(DAGNN, self).__init__() + self.mlp = nn.ModuleList() + self.mlp.append( + MLPLayer( + in_dim=in_dim, + out_dim=hid_dim, + bias=bias, + activation=activation, + dropout=dropout, + ) + ) + self.mlp.append( + MLPLayer( + in_dim=hid_dim, + out_dim=out_dim, + bias=bias, + activation=None, + dropout=dropout, + ) + ) + self.dagnn = DAGNNConv(in_dim=out_dim, k=k) + + self.criterion = nn.CrossEntropyLoss() + + def forward(self, graph, feats): + for layer in self.mlp: + feats = layer(feats) + feats = self.dagnn(graph, feats) + return feats + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, graph, feats): + return self.forward(graph, feats) diff --git a/hugegraph-ml/src/hugegraph_ml/models/deepergcn.py b/hugegraph-ml/src/hugegraph_ml/models/deepergcn.py new file mode 100644 index 00000000..03fc2927 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/deepergcn.py @@ -0,0 +1,287 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +DeeperGCN + +References +---------- +Paper: https://arxiv.org/abs/2006.07739 +Author's code: https://github.com/lightaime/deep_gcns_torch +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/deepergcn +""" + +import dgl.function as fn +import torch.nn as nn +import torch.nn.functional as F +from dgl.nn.functional import edge_softmax +from dgl.nn.pytorch.glob import AvgPooling +from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder +import torch + + +class DeeperGCN(nn.Module): + r""" + + Description + ----------- + Introduced in "DeeperGCN: All You Need to Train Deeper GCNs " + + Parameters + ---------- + node_feat_dim: int + Size of node feature. + edge_feat_dim: int + Size of edge feature. + hid_dim: int + Size of hidden representations. + out_dim: int + Size of output. + num_layers: int + Number of graph convolutional layers. + dropout: float + Dropout rate. Default is 0. + beta: float + A continuous variable called an inverse temperature. Default is 1.0. + learn_beta: bool + Whether beta is a learnable weight. Default is False. + aggr: str + Type of aggregation. Default is 'softmax'. + mlp_layers: int + Number of MLP layers in message normalization. Default is 1. + """ + + def __init__( + self, + node_feat_dim, + edge_feat_dim, + hid_dim, + out_dim, + num_layers, + dropout=0.0, + beta=1.0, + learn_beta=False, + aggr="softmax", + mlp_layers=1, + ): + super(DeeperGCN, self).__init__() + + self.num_layers = num_layers + self.dropout = dropout + self.gcns = nn.ModuleList() + self.norms = nn.ModuleList() + + for _ in range(self.num_layers): + conv = GENConv( + edge_feat_dim=edge_feat_dim, + in_dim=hid_dim, + out_dim=hid_dim, + aggregator=aggr, + beta=beta, + learn_beta=learn_beta, + mlp_layers=mlp_layers, + ) + + self.gcns.append(conv) + self.norms.append(nn.BatchNorm1d(hid_dim, affine=True)) + + # self.node_encoder = AtomEncoder(hid_dim) + self.node_encoder = torch.nn.Sequential( + torch.nn.Linear(node_feat_dim, 512), + torch.nn.ReLU(), + torch.nn.Linear(512, hid_dim), + ) + # self.pooling = AvgPooling() + self.output = nn.Linear(hid_dim, out_dim) + + self.criterion = nn.CrossEntropyLoss() + + def forward(self, g, edge_feats, node_feats=None): + with g.local_scope(): + hv = self.node_encoder(node_feats.float()) + he = edge_feats + + for layer in range(self.num_layers): + hv1 = self.norms[layer](hv) + hv1 = F.relu(hv1) + hv1 = F.dropout(hv1, p=self.dropout, training=self.training) + hv = self.gcns[layer](g, hv1, he) + hv + + # h_g = self.pooling(g, hv) + + return self.output(hv) + + def loss(self, logits, labels): + return self.criterion(logits, labels) + + def inference(self, g, edge_feats, node_feats): + return self.forward(g, edge_feats, node_feats) + + +class GENConv(nn.Module): + r""" + + Description + ----------- + Generalized Message Aggregator was introduced in "DeeperGCN: All You Need to Train Deeper GCNs " + + Parameters + ---------- + in_dim: int + Input size. + out_dim: int + Output size. + aggregator: str + Type of aggregation. Default is 'softmax'. + beta: float + A continuous variable called an inverse temperature. Default is 1.0. 
+ learn_beta: bool + Whether beta is a learnable variable or not. Default is False. + p: float + Initial power for power mean aggregation. Default is 1.0. + learn_p: bool + Whether p is a learnable variable or not. Default is False. + msg_norm: bool + Whether message normalization is used. Default is False. + learn_msg_scale: bool + Whether s is a learnable scaling factor or not in message normalization. Default is False. + mlp_layers: int + The number of MLP layers. Default is 1. + eps: float + A small positive constant in message construction function. Default is 1e-7. + """ + + def __init__( + self, + edge_feat_dim, + in_dim, + out_dim, + aggregator="softmax", + beta=1.0, + learn_beta=False, + p=1.0, + learn_p=False, + msg_norm=False, + learn_msg_scale=False, + mlp_layers=1, + eps=1e-7, + ): + super(GENConv, self).__init__() + + self.aggr = aggregator + self.eps = eps + + channels = [in_dim] + for _ in range(mlp_layers - 1): + channels.append(in_dim * 2) + channels.append(out_dim) + + self.mlp = MLP(channels) + self.msg_norm = MessageNorm(learn_msg_scale) if msg_norm else None + + self.beta = ( + nn.Parameter(torch.Tensor([beta]), requires_grad=True) + if learn_beta and self.aggr == "softmax" + else beta + ) + self.p = nn.Parameter(torch.Tensor([p]), requires_grad=True) if learn_p else p + + # self.edge_encoder = BondEncoder(in_dim) + self.edge_encoder = torch.nn.Sequential( + torch.nn.Linear(edge_feat_dim, 512), + torch.nn.ReLU(), + torch.nn.Linear(512, in_dim), + ) + + def forward(self, g, node_feats, edge_feats): + with g.local_scope(): + # Node and edge feature size need to match. + g.ndata["h"] = node_feats + g.edata["h"] = self.edge_encoder(edge_feats.float()) + g.apply_edges(fn.u_add_e("h", "h", "m")) + + if self.aggr == "softmax": + g.edata["m"] = F.relu(g.edata["m"]) + self.eps + g.edata["a"] = edge_softmax(g, g.edata["m"] * self.beta) + g.update_all( + lambda edge: {"x": edge.data["m"] * edge.data["a"]}, + fn.sum("x", "m"), + ) + + elif self.aggr == "power": + minv, maxv = 1e-7, 1e1 + torch.clamp_(g.edata["m"], minv, maxv) + g.update_all( + lambda edge: {"x": torch.pow(edge.data["m"], self.p)}, + fn.mean("x", "m"), + ) + torch.clamp_(g.ndata["m"], minv, maxv) + g.ndata["m"] = torch.pow(g.ndata["m"], self.p) + + else: + raise NotImplementedError(f"Aggregator {self.aggr} is not supported.") + + if self.msg_norm is not None: + g.ndata["m"] = self.msg_norm(node_feats, g.ndata["m"]) + + feats = node_feats + g.ndata["m"] + + return self.mlp(feats) + + +class MLP(nn.Sequential): + r""" + + Description + ----------- + From equation (5) in "DeeperGCN: All You Need to Train Deeper GCNs " + """ + + def __init__(self, channels, act="relu", dropout=0.0, bias=True): + layers = [] + + for i in range(1, len(channels)): + layers.append(nn.Linear(channels[i - 1], channels[i], bias)) + if i < len(channels) - 1: + layers.append(nn.BatchNorm1d(channels[i], affine=True)) + layers.append(nn.ReLU()) + layers.append(nn.Dropout(dropout)) + + super(MLP, self).__init__(*layers) + + +class MessageNorm(nn.Module): + r""" + + Description + ----------- + Message normalization was introduced in "DeeperGCN: All You Need to Train Deeper GCNs " + + Parameters + ---------- + learn_scale: bool + Whether s is a learnable scaling factor or not. Default is False. 
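+ + The forward pass below computes :math:`s \cdot \|\mathbf{h}\|_2 \cdot \mathbf{m} / \|\mathbf{m}\|_2`, + i.e. the :math:`\ell_2`-normalized message rescaled by the norm of the node features.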
+ """ + + def __init__(self, learn_scale=False): + super(MessageNorm, self).__init__() + self.scale = nn.Parameter(torch.FloatTensor([1.0]), requires_grad=learn_scale) + + def forward(self, feats, msg, p=2): + msg = F.normalize(msg, p=2, dim=-1) + feats_norm = feats.norm(p=p, dim=-1, keepdim=True) + return msg * feats_norm * self.scale diff --git a/hugegraph-ml/src/hugegraph_ml/models/gatne.py b/hugegraph-ml/src/hugegraph_ml/models/gatne.py new file mode 100644 index 00000000..794d1ccb --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/gatne.py @@ -0,0 +1,273 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.right (c) 2024 by jinsong, All Rights Reserved. + +""" +General Attributed Multiplex HeTerogeneous Network Embedding (GATNE) + +References +---------- +Paper: https://arxiv.org/abs/1905.01669 +Author's code: https://github.com/THUDM/GATNE +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/GATNE-T +""" + +import math +import os +import sys +import time +from collections import defaultdict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from numpy import random +from torch.nn.parameter import Parameter +from tqdm.auto import tqdm + +import dgl +import dgl.function as fn +import multiprocessing +from functools import partial, reduce, wraps + + +class NeighborSampler(object): + def __init__(self, g, num_fanouts): + self.g = g + self.num_fanouts = num_fanouts + + def sample(self, pairs): + heads, tails, types = zip(*pairs) + seeds, head_invmap = torch.unique(torch.LongTensor(heads), return_inverse=True) + blocks = [] + for fanout in reversed(self.num_fanouts): + sampled_graph = dgl.sampling.sample_neighbors(self.g, seeds, fanout) + sampled_block = dgl.to_block(sampled_graph, seeds) + seeds = sampled_block.srcdata[dgl.NID] + blocks.insert(0, sampled_block) + return ( + blocks, + torch.LongTensor(head_invmap), + torch.LongTensor(tails), + torch.LongTensor(types), + ) + + +class DGLGATNE(nn.Module): + def __init__( + self, + num_nodes, + embedding_size, + embedding_u_size, + edge_types, + edge_type_count, + dim_a, + ): + super(DGLGATNE, self).__init__() + self.num_nodes = num_nodes + self.embedding_size = embedding_size + self.embedding_u_size = embedding_u_size + self.edge_types = edge_types + self.edge_type_count = edge_type_count + self.dim_a = dim_a + + self.node_embeddings = Parameter(torch.FloatTensor(num_nodes, embedding_size)) + self.node_type_embeddings = Parameter( + torch.FloatTensor(num_nodes, edge_type_count, embedding_u_size) + ) + self.trans_weights = Parameter( + torch.FloatTensor(edge_type_count, embedding_u_size, embedding_size) + ) + self.trans_weights_s1 = Parameter( + torch.FloatTensor(edge_type_count, embedding_u_size, dim_a) + ) + self.trans_weights_s2 = 
Parameter(torch.FloatTensor(edge_type_count, dim_a, 1)) + + self.reset_parameters() + + def reset_parameters(self): + self.node_embeddings.data.uniform_(-1.0, 1.0) + self.node_type_embeddings.data.uniform_(-1.0, 1.0) + self.trans_weights.data.normal_(std=1.0 / math.sqrt(self.embedding_size)) + self.trans_weights_s1.data.normal_(std=1.0 / math.sqrt(self.embedding_size)) + self.trans_weights_s2.data.normal_(std=1.0 / math.sqrt(self.embedding_size)) + + # embs: [batch_size, embedding_size] + def forward(self, block): + input_nodes = block.srcdata[dgl.NID] + output_nodes = block.dstdata[dgl.NID] + batch_size = block.number_of_dst_nodes() + node_embed = self.node_embeddings + node_type_embed = [] + + with block.local_scope(): + for i in range(self.edge_type_count): + edge_type = self.edge_types[i] + block.srcdata[edge_type] = self.node_type_embeddings[input_nodes, i] + block.dstdata[edge_type] = self.node_type_embeddings[output_nodes, i] + block.update_all( + fn.copy_u(edge_type, "m"), + fn.sum("m", edge_type), + etype=edge_type, + ) + node_type_embed.append(block.dstdata[edge_type]) + + node_type_embed = torch.stack(node_type_embed, 1) + tmp_node_type_embed = node_type_embed.unsqueeze(2).view( + -1, 1, self.embedding_u_size + ) + trans_w = ( + self.trans_weights.unsqueeze(0) + .repeat(batch_size, 1, 1, 1) + .view(-1, self.embedding_u_size, self.embedding_size) + ) + trans_w_s1 = ( + self.trans_weights_s1.unsqueeze(0) + .repeat(batch_size, 1, 1, 1) + .view(-1, self.embedding_u_size, self.dim_a) + ) + trans_w_s2 = ( + self.trans_weights_s2.unsqueeze(0) + .repeat(batch_size, 1, 1, 1) + .view(-1, self.dim_a, 1) + ) + + attention = ( + F.softmax( + torch.matmul( + torch.tanh(torch.matmul(tmp_node_type_embed, trans_w_s1)), + trans_w_s2, + ) + .squeeze(2) + .view(-1, self.edge_type_count), + dim=1, + ) + .unsqueeze(1) + .repeat(1, self.edge_type_count, 1) + ) + + node_type_embed = torch.matmul(attention, node_type_embed).view( + -1, 1, self.embedding_u_size + ) + node_embed = node_embed[output_nodes].unsqueeze(1).repeat( + 1, self.edge_type_count, 1 + ) + torch.matmul(node_type_embed, trans_w).view( + -1, self.edge_type_count, self.embedding_size + ) + last_node_embed = F.normalize(node_embed, dim=2) + + return last_node_embed # [batch_size, edge_type_count, embedding_size] + + +class NSLoss(nn.Module): + def __init__(self, num_nodes, num_sampled, embedding_size): + super(NSLoss, self).__init__() + self.num_nodes = num_nodes + self.num_sampled = num_sampled + self.embedding_size = embedding_size + self.weights = Parameter(torch.FloatTensor(num_nodes, embedding_size)) + # [ (log(i+2) - log(i+1)) / log(num_nodes + 1)] + self.sample_weights = F.normalize( + torch.Tensor( + [ + (math.log(k + 2) - math.log(k + 1)) / math.log(num_nodes + 1) + for k in range(num_nodes) + ] + ), + dim=0, + ) + + self.reset_parameters() + + def reset_parameters(self): + self.weights.data.normal_(std=1.0 / math.sqrt(self.embedding_size)) + + def forward(self, input, embs, label): + n = input.shape[0] + log_target = torch.log( + torch.sigmoid(torch.sum(torch.mul(embs, self.weights[label]), 1)) + ) + negs = torch.multinomial( + self.sample_weights, self.num_sampled * n, replacement=True + ).view(n, self.num_sampled) + noise = torch.neg(self.weights[negs]) + sum_log_sampled = torch.sum( + torch.log(torch.sigmoid(torch.bmm(noise, embs.unsqueeze(2)))), 1 + ).squeeze() + + loss = log_target + sum_log_sampled + return -loss.sum() / n + + +def generate_pairs_parallel(walks, skip_window=None, layer_id=None): + pairs = [] + for walk in 
walks: + walk = walk.tolist() + for i in range(len(walk)): + for j in range(1, skip_window + 1): + if i - j >= 0: + pairs.append((walk[i], walk[i - j], layer_id)) + if i + j < len(walk): + pairs.append((walk[i], walk[i + j], layer_id)) + return pairs + + +def generate_pairs(all_walks, window_size, num_workers): + # for each node, choose the first neighbor and second neighbor of it to form pairs + # Get all worker processes + start_time = time.time() + print("We are generating pairs with {} cores.".format(num_workers)) + + # Start all worker processes + pool = multiprocessing.Pool(processes=num_workers) + pairs = [] + skip_window = window_size // 2 + for layer_id, walks in enumerate(all_walks): + block_num = len(walks) // num_workers + if block_num > 0: + walks_list = [ + walks[i * block_num : min((i + 1) * block_num, len(walks))] + for i in range(num_workers) + ] + else: + walks_list = [walks] + tmp_result = pool.map( + partial( + generate_pairs_parallel, + skip_window=skip_window, + layer_id=layer_id, + ), + walks_list, + ) + pairs += reduce(lambda x, y: x + y, tmp_result) + + pool.close() + end_time = time.time() + print("Generate pairs end, use {}s.".format(end_time - start_time)) + return np.array([list(pair) for pair in set(pairs)]) + + +def construct_typenodes_from_graph(graph): + nodes = [] + for etype in graph.etypes: + edges = graph.edges(etype=etype) + node1, node2 = edges + node1_list = node1.cpu().numpy().tolist() + node2_list = node2.cpu().numpy().tolist() + tmp_nodes = list(set(node1_list + node2_list)) + nodes.append(tmp_nodes) + return nodes diff --git a/hugegraph-ml/src/hugegraph_ml/models/pgnn.py b/hugegraph-ml/src/hugegraph_ml/models/pgnn.py new file mode 100644 index 00000000..de0f7408 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/pgnn.py @@ -0,0 +1,462 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +Position-aware Graph Neural Networks (P-GNN) + +References +---------- +Paper: http://proceedings.mlr.press/v97/you19b/you19b.pdf +Author's code: https://github.com/JiaxuanYou/P-GNN +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/P-GNN +""" + +import dgl.function as fn +import torch +import torch.nn as nn +import torch.nn.functional as F +import multiprocessing as mp +import random +from multiprocessing import get_context + +import networkx as nx +import numpy as np +from tqdm.auto import tqdm +from sklearn.metrics import roc_auc_score + + +class PGNN_layer(nn.Module): + def __init__(self, input_dim, output_dim): + super(PGNN_layer, self).__init__() + self.input_dim = input_dim + + self.linear_hidden_u = nn.Linear(input_dim, output_dim) + self.linear_hidden_v = nn.Linear(input_dim, output_dim) + self.linear_out_position = nn.Linear(output_dim, 1) + self.act = nn.ReLU() + + def forward(self, graph, feature, anchor_eid, dists_max): + with graph.local_scope(): + u_feat = self.linear_hidden_u(feature) + v_feat = self.linear_hidden_v(feature) + graph.srcdata.update({"u_feat": u_feat}) + graph.dstdata.update({"v_feat": v_feat}) + + graph.apply_edges(fn.u_mul_e("u_feat", "sp_dist", "u_message")) + graph.apply_edges(fn.v_add_e("v_feat", "u_message", "message")) + + messages = torch.index_select( + graph.edata["message"], + 0, + torch.LongTensor(anchor_eid).to(feature.device), + ) + messages = messages.reshape( + dists_max.shape[0], dists_max.shape[1], messages.shape[-1] + ) + + messages = self.act(messages) # n*m*d + + out_position = self.linear_out_position(messages).squeeze(-1) # n*m_out + out_structure = torch.mean(messages, dim=1) # n*d + + return out_position, out_structure + + +class PGNN(nn.Module): + def __init__(self, input_dim, feature_dim=32, dropout=0.5): + super(PGNN, self).__init__() + self.dropout = nn.Dropout(dropout) + + self.linear_pre = nn.Linear(input_dim, feature_dim) + self.conv_first = PGNN_layer(feature_dim, feature_dim) + self.conv_out = PGNN_layer(feature_dim, feature_dim) + + def forward(self, data): + x = data["graph"].ndata["feat"] + graph = data["graph"] + x = self.linear_pre(x) + x_position, x = self.conv_first(graph, x, data["anchor_eid"], data["dists_max"]) + + x = self.dropout(x) + x_position, x = self.conv_out(graph, x, data["anchor_eid"], data["dists_max"]) + x_position = F.normalize(x_position, p=2, dim=-1) + return x_position + + +def get_communities(remove_feature, graph): + community_size = 20 + # Randomly rewire 1% edges + node_list = list(graph.nodes) + for u, v in graph.edges(): + if random.random() < 0.01: + x = random.choice(node_list) + if graph.has_edge(u, x): + continue + graph.remove_edge(u, v) + graph.add_edge(u, x) + + # remove self-loops + graph.remove_edges_from(nx.selfloop_edges(graph)) + edge_index = np.array(list(graph.edges)) + # Add (i, j) for an edge (j, i) + edge_index = np.concatenate((edge_index, edge_index[:, ::-1]), axis=0) + edge_index = torch.from_numpy(edge_index).long().permute(1, 0) + + n = graph.number_of_nodes() + label = np.zeros((n, n), dtype=int) + for u in node_list: + # the node IDs are simply consecutive integers from 0 + for v in range(u): + if u // community_size == v // community_size: + label[u, v] = 1 + + if remove_feature: + feature = torch.ones((n, 1)) + else: + rand_order = np.random.permutation(n) + feature = np.identity(n)[:, rand_order] + + data = { + "edge_index": edge_index, + "feature": feature, + "positive_edges": np.stack(np.nonzero(label)), + "num_nodes": feature.shape[0], + } 
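+    # With community_size = 20, nodes 0..19 form community 0, nodes 20..39 form
+    # community 1, and so on; e.g. (25, 21) is a positive pair while (25, 19) is
+    # not, since label[u, v] = 1 only for same-community pairs.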
+ + return data + + +def to_single_directed(edges): + edges_new = np.zeros((2, edges.shape[1] // 2), dtype=int) + j = 0 + for i in range(edges.shape[1]): + if edges[0, i] < edges[1, i]: + edges_new[:, j] = edges[:, i] + j += 1 + + return edges_new + + +# each node at least remain in the new graph +def split_edges(p, edges, data, non_train_ratio=0.2): + e = edges.shape[1] + edges = edges[:, np.random.permutation(e)] + split1 = int((1 - non_train_ratio) * e) + split2 = int((1 - non_train_ratio / 2) * e) + + data.update( + { + "{}_edges_train".format(p): edges[:, :split1], # 80% + "{}_edges_val".format(p): edges[:, split1:split2], # 10% + "{}_edges_test".format(p): edges[:, split2:], # 10% + } + ) + + +def to_bidirected(edges): + return np.concatenate((edges, edges[::-1, :]), axis=-1) + + +def get_negative_edges(positive_edges, num_nodes, num_negative_edges): + positive_edge_set = [] + positive_edges = to_bidirected(positive_edges) + for i in range(positive_edges.shape[1]): + positive_edge_set.append(tuple(positive_edges[:, i])) + positive_edge_set = set(positive_edge_set) + + negative_edges = np.zeros((2, num_negative_edges), dtype=positive_edges.dtype) + for i in range(num_negative_edges): + while True: + mask_temp = tuple(np.random.choice(num_nodes, size=(2,), replace=False)) + if mask_temp not in positive_edge_set: + negative_edges[:, i] = mask_temp + break + + return negative_edges + + +def get_pos_neg_edges(data, infer_link_positive=True): + if infer_link_positive: + data["positive_edges"] = to_single_directed(data["edge_index"].numpy()) + split_edges("positive", data["positive_edges"], data) + + # resample edge mask link negative + negative_edges = get_negative_edges( + data["positive_edges"], + data["num_nodes"], + num_negative_edges=data["positive_edges"].shape[1], + ) + split_edges("negative", negative_edges, data) + + return data + + +def shortest_path(graph, node_range, cutoff): + dists_dict = {} + for node in tqdm(node_range, leave=False): + dists_dict[node] = nx.single_source_shortest_path_length(graph, node, cutoff) + return dists_dict + + +def merge_dicts(dicts): + result = {} + for dictionary in dicts: + result.update(dictionary) + return result + + +def all_pairs_shortest_path(graph, cutoff=None, num_workers=4): + nodes = list(graph.nodes) + random.shuffle(nodes) + pool = mp.Pool(processes=num_workers) + interval_size = len(nodes) / num_workers + results = [ + pool.apply_async( + shortest_path, + args=( + graph, + nodes[int(interval_size * i) : int(interval_size * (i + 1))], + cutoff, + ), + ) + for i in range(num_workers) + ] + output = [p.get() for p in results] + dists_dict = merge_dicts(output) + pool.close() + pool.join() + return dists_dict + + +def precompute_dist_data(edge_index, num_nodes, approximate=0): + """ + Here dist is 1/real_dist, higher actually means closer, 0 means disconnected + :return: + """ + graph = nx.Graph() + edge_list = edge_index.transpose(1, 0).tolist() + graph.add_edges_from(edge_list) + + n = num_nodes + dists_array = np.zeros((n, n)) + dists_dict = all_pairs_shortest_path( + graph, cutoff=approximate if approximate > 0 else None + ) + node_list = graph.nodes() + for node_i in node_list: + shortest_dist = dists_dict[node_i] + for node_j in node_list: + dist = shortest_dist.get(node_j, -1) + if dist != -1: + dists_array[node_i, node_j] = 1 / (dist + 1) + return dists_array + + +def get_dataset(graph): + # Generate graph data + data_info = get_communities(False, graph) + # Get positive and negative edges + data = get_pos_neg_edges(data_info, 
infer_link_positive=True) + # Pre-compute shortest path length + dists_removed = precompute_dist_data( + data["positive_edges_train"], + data["num_nodes"], + approximate=-1, + ) + data["dists"] = torch.from_numpy(dists_removed).float() + data["edge_index"] = torch.from_numpy( + to_bidirected(data["positive_edges_train"]) + ).long() + + return data + + +def get_anchors(n): + """Get a list of NumPy arrays, each of them is an anchor node set""" + m = int(np.log2(n)) + anchor_set_id = [] + for i in range(m): + anchor_size = int(n / np.exp2(i + 1)) + for _ in range(m): + anchor_set_id.append(np.random.choice(n, size=anchor_size, replace=False)) + return anchor_set_id + + +def get_dist_max(anchor_set_id, dist): + # N x K, N is number of nodes, K is the number of anchor sets + dist_max = torch.zeros((dist.shape[0], len(anchor_set_id))) + dist_argmax = torch.zeros((dist.shape[0], len(anchor_set_id))).long() + for i in range(len(anchor_set_id)): + temp_id = torch.as_tensor(anchor_set_id[i], dtype=torch.long) + # Get reciprocal of shortest distance to each node in the i-th anchor set + dist_temp = torch.index_select(dist, 1, temp_id) + # For each node in the graph, find its closest anchor node in the set + # and the reciprocal of shortest distance + dist_max_temp, dist_argmax_temp = torch.max(dist_temp, dim=-1) + dist_max[:, i] = dist_max_temp + dist_argmax[:, i] = torch.index_select(temp_id, 0, dist_argmax_temp) + return dist_max, dist_argmax + + +def get_a_graph(dists_max, dists_argmax): + src = [] + dst = [] + real_src = [] + real_dst = [] + edge_weight = [] + dists_max = dists_max.numpy() + for i in range(dists_max.shape[0]): + # Get unique closest anchor nodes for node i across all anchor sets + tmp_dists_argmax, tmp_dists_argmax_idx = np.unique(dists_argmax[i, :], True) + src.extend([i] * tmp_dists_argmax.shape[0]) + real_src.extend([i] * dists_argmax[i, :].shape[0]) + real_dst.extend(list(dists_argmax[i, :].numpy())) + dst.extend(list(tmp_dists_argmax)) + edge_weight.extend(dists_max[i, tmp_dists_argmax_idx].tolist()) + eid_dict = {(u, v): i for i, (u, v) in enumerate(list(zip(dst, src)))} + anchor_eid = [eid_dict.get((u, v)) for u, v in zip(real_dst, real_src)] + g = (dst, src) + return g, anchor_eid, edge_weight + + +def get_graphs(data, anchor_sets): + graphs = [] + anchor_eids = [] + dists_max_list = [] + edge_weights = [] + for anchor_set in tqdm(anchor_sets, leave=False): + dists_max, dists_argmax = get_dist_max(anchor_set, data["dists"]) + g, anchor_eid, edge_weight = get_a_graph(dists_max, dists_argmax) + graphs.append(g) + anchor_eids.append(anchor_eid) + dists_max_list.append(dists_max) + edge_weights.append(edge_weight) + + return graphs, anchor_eids, dists_max_list, edge_weights + + +def merge_result(outputs): + graphs = [] + anchor_eids = [] + dists_max_list = [] + edge_weights = [] + + for g, anchor_eid, dists_max, edge_weight in outputs: + graphs.extend(g) + anchor_eids.extend(anchor_eid) + dists_max_list.extend(dists_max) + edge_weights.extend(edge_weight) + + return graphs, anchor_eids, dists_max_list, edge_weights + + +def preselect_anchor(data, num_workers=4): + pool = get_context("spawn").Pool(processes=num_workers) + # Pre-compute anchor sets, a collection of anchor sets per epoch + anchor_set_ids = [get_anchors(data["num_nodes"]) for _ in range(200)] + interval_size = len(anchor_set_ids) / num_workers + results = [ + pool.apply_async( + get_graphs, + args=( + data, + anchor_set_ids[int(interval_size * i) : int(interval_size * (i + 1))], + ), + ) + for i in 
range(num_workers) + ] + + output = [p.get() for p in results] + graphs, anchor_eids, dists_max_list, edge_weights = merge_result(output) + pool.close() + pool.join() + + return graphs, anchor_eids, dists_max_list, edge_weights + + +def get_loss(p, data, out, loss_func, device, get_auc=True): + edge_mask = np.concatenate( + ( + data["positive_edges_{}".format(p)], + data["negative_edges_{}".format(p)], + ), + axis=-1, + ) + + nodes_first = torch.index_select( + out, 0, torch.from_numpy(edge_mask[0, :]).long().to(out.device) + ) + nodes_second = torch.index_select( + out, 0, torch.from_numpy(edge_mask[1, :]).long().to(out.device) + ) + + pred = torch.sum(nodes_first * nodes_second, dim=-1) + + label_positive = torch.ones( + [ + data["positive_edges_{}".format(p)].shape[1], + ], + dtype=pred.dtype, + ) + label_negative = torch.zeros( + [ + data["negative_edges_{}".format(p)].shape[1], + ], + dtype=pred.dtype, + ) + label = torch.cat((label_positive, label_negative)).to(device) + loss = loss_func(pred, label) + + if get_auc: + auc = roc_auc_score( + label.flatten().cpu().numpy(), + torch.sigmoid(pred).flatten().data.cpu().numpy(), + ) + return loss, auc + else: + return loss + + +def train_model(data, model, loss_func, optimizer, device, g_data): + model.train() + out = model(g_data) + + loss = get_loss("train", data, out, loss_func, device, get_auc=False) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + optimizer.zero_grad() + + return g_data + + +def eval_model(data, g_data, model, loss_func, device): + model.eval() + out = model(g_data) + + # train loss and auc + tmp_loss, auc_train = get_loss("train", data, out, loss_func, device) + loss_train = tmp_loss.cpu().data.numpy() + + # val loss and auc + _, auc_val = get_loss("val", data, out, loss_func, device) + + # test loss and auc + _, auc_test = get_loss("test", data, out, loss_func, device) + + return loss_train, auc_train, auc_val, auc_test diff --git a/hugegraph-ml/src/hugegraph_ml/models/seal.py b/hugegraph-ml/src/hugegraph_ml/models/seal.py new file mode 100644 index 00000000..09060ed8 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/models/seal.py @@ -0,0 +1,826 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
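+
+# A minimal sketch of the intended flow (illustrative only; the
+# LinkPredictionSeal task later in this patch follows the same steps):
+#
+#   graph, split_edge = load_ogb_dataset("ogbl-collab")
+#   seal_data = SEALData(g=graph, split_edge=split_edge, hop=1, subsample_ratio=0.1)
+#   train_set = seal_data("train")   # label target pairs, extract 1-hop subgraphs
+#   model = DGCNN(num_layers=3, hidden_units=32,
+#                 node_attributes=seal_data.ndata["feat"],
+#                 edge_weights=seal_data.edata["weight"].float())
+#   logits = model(batched_g, batched_g.ndata["z"], batched_g.ndata[NID], batched_g.edata[EID])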
+ +""" +SEAL + +References +---------- +Paper: https://arxiv.org/abs/1802.09691 +Author's code: https://github.com/muhanzhang/SEAL +DGL code: https://github.com/dmlc/dgl/tree/master/examples/pytorch/seal +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from dgl.nn.pytorch import GraphConv, SAGEConv, SortPooling, SumPooling +import argparse + +import dgl + +import numpy as np +import pandas as pd +from ogb.linkproppred import DglLinkPropPredDataset, Evaluator +from scipy.sparse.csgraph import shortest_path + +import os.path as osp +from copy import deepcopy + +from dgl import add_self_loop, DGLGraph, NID +from dgl.dataloading.negative_sampler import Uniform +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm + +import logging +import os +import time + + +class GCN(nn.Module): + """ + GCN Model + + Attributes: + num_layers(int): num of gcn layers + hidden_units(int): num of hidden units + gcn_type(str): type of gcn layer, 'gcn' for GraphConv and 'sage' for SAGEConv + pooling_type(str): type of graph pooling to get subgraph representation + 'sum' for sum pooling and 'center' for center pooling. + node_attributes(Tensor, optional): node attribute + edge_weights(Tensor, optional): edge weight + node_embedding(Tensor, optional): pre-trained node embedding + use_embedding(bool, optional): whether to use node embedding. Note that if 'use_embedding' is set True + and 'node_embedding' is None, will automatically randomly initialize node embedding. + num_nodes(int, optional): num of nodes + dropout(float, optional): dropout rate + max_z(int, optional): default max vocab size of node labeling, default 1000. + + """ + + def __init__( + self, + num_layers, + hidden_units, + gcn_type="gcn", + pooling_type="sum", + node_attributes=None, + edge_weights=None, + node_embedding=None, + use_embedding=False, + num_nodes=None, + dropout=0.5, + max_z=1000, + ): + super(GCN, self).__init__() + self.num_layers = num_layers + self.dropout = dropout + self.pooling_type = pooling_type + self.use_attribute = False if node_attributes is None else True + self.use_embedding = use_embedding + self.use_edge_weight = False if edge_weights is None else True + + self.z_embedding = nn.Embedding(max_z, hidden_units) + if node_attributes is not None: + self.node_attributes_lookup = nn.Embedding.from_pretrained(node_attributes) + self.node_attributes_lookup.weight.requires_grad = False + if edge_weights is not None: + self.edge_weights_lookup = nn.Embedding.from_pretrained(edge_weights) + self.edge_weights_lookup.weight.requires_grad = False + if node_embedding is not None: + self.node_embedding = nn.Embedding.from_pretrained(node_embedding) + self.node_embedding.weight.requires_grad = False + elif use_embedding: + self.node_embedding = nn.Embedding(num_nodes, hidden_units) + + initial_dim = hidden_units + if self.use_attribute: + initial_dim += self.node_attributes_lookup.embedding_dim + if self.use_embedding: + initial_dim += self.node_embedding.embedding_dim + + self.layers = nn.ModuleList() + if gcn_type == "gcn": + self.layers.append( + GraphConv(initial_dim, hidden_units, allow_zero_in_degree=True) + ) + for _ in range(num_layers - 1): + self.layers.append( + GraphConv(hidden_units, hidden_units, allow_zero_in_degree=True) + ) + elif gcn_type == "sage": + self.layers.append( + SAGEConv(initial_dim, hidden_units, aggregator_type="gcn") + ) + for _ in range(num_layers - 1): + self.layers.append( + SAGEConv(hidden_units, hidden_units, aggregator_type="gcn") + ) + else: + raise 
ValueError("Gcn type error.") + + self.linear_1 = nn.Linear(hidden_units, hidden_units) + self.linear_2 = nn.Linear(hidden_units, 1) + if pooling_type != "sum": + raise ValueError("Pooling type error.") + self.pooling = SumPooling() + + def reset_parameters(self): + for layer in self.layers: + layer.reset_parameters() + + def forward(self, g, z, node_id=None, edge_id=None): + """ + Args: + g(DGLGraph): the graph + z(Tensor): node labeling tensor, shape [N, 1] + node_id(Tensor, optional): node id tensor, shape [N, 1] + edge_id(Tensor, optional): edge id tensor, shape [E, 1] + Returns: + x(Tensor): output tensor + + """ + + z_emb = self.z_embedding(z) + + if self.use_attribute: + x = self.node_attributes_lookup(node_id) + x = torch.cat([z_emb, x], 1) + else: + x = z_emb + + if self.use_edge_weight: + edge_weight = self.edge_weights_lookup(edge_id) + else: + edge_weight = None + + if self.use_embedding: + n_emb = self.node_embedding(node_id) + x = torch.cat([x, n_emb], 1) + + for layer in self.layers[:-1]: + x = layer(g, x, edge_weight=edge_weight) + x = F.relu(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = self.layers[-1](g, x, edge_weight=edge_weight) + + x = self.pooling(g, x) + x = F.relu(self.linear_1(x)) + F.dropout(x, p=self.dropout, training=self.training) + x = self.linear_2(x) + + return x + + +class DGCNN(nn.Module): + """ + An end-to-end deep learning architecture for graph classification. + paper link: https://muhanzhang.github.io/papers/AAAI_2018_DGCNN.pdf + + Attributes: + num_layers(int): num of gcn layers + hidden_units(int): num of hidden units + k(int, optional): The number of nodes to hold for each graph in SortPooling. + gcn_type(str): type of gcn layer, 'gcn' for GraphConv and 'sage' for SAGEConv + node_attributes(Tensor, optional): node attribute + edge_weights(Tensor, optional): edge weight + node_embedding(Tensor, optional): pre-trained node embedding + use_embedding(bool, optional): whether to use node embedding. Note that if 'use_embedding' is set True + and 'node_embedding' is None, will automatically randomly initialize node embedding. + num_nodes(int, optional): num of nodes + dropout(float, optional): dropout rate + max_z(int, optional): default max vocab size of node labeling, default 1000. 
+ """ + + def __init__( + self, + num_layers, + hidden_units, + k=10, + gcn_type="gcn", + node_attributes=None, + edge_weights=None, + node_embedding=None, + use_embedding=False, + num_nodes=None, + dropout=0.5, + max_z=1000, + ): + super(DGCNN, self).__init__() + self.num_layers = num_layers + self.dropout = dropout + self.use_attribute = False if node_attributes is None else True + self.use_embedding = use_embedding + self.use_edge_weight = False if edge_weights is None else True + + self.z_embedding = nn.Embedding(max_z, hidden_units) + + if node_attributes is not None: + self.node_attributes_lookup = nn.Embedding.from_pretrained(node_attributes) + self.node_attributes_lookup.weight.requires_grad = False + if edge_weights is not None: + self.edge_weights_lookup = nn.Embedding.from_pretrained(edge_weights) + self.edge_weights_lookup.weight.requires_grad = False + if node_embedding is not None: + self.node_embedding = nn.Embedding.from_pretrained(node_embedding) + self.node_embedding.weight.requires_grad = False + elif use_embedding: + self.node_embedding = nn.Embedding(num_nodes, hidden_units) + + initial_dim = hidden_units + if self.use_attribute: + initial_dim += self.node_attributes_lookup.embedding_dim + if self.use_embedding: + initial_dim += self.node_embedding.embedding_dim + + self.layers = nn.ModuleList() + if gcn_type == "gcn": + self.layers.append( + GraphConv(initial_dim, hidden_units, allow_zero_in_degree=True) + ) + for _ in range(num_layers - 1): + self.layers.append( + GraphConv(hidden_units, hidden_units, allow_zero_in_degree=True) + ) + self.layers.append(GraphConv(hidden_units, 1, allow_zero_in_degree=True)) + elif gcn_type == "sage": + self.layers.append( + SAGEConv(initial_dim, hidden_units, aggregator_type="gcn") + ) + for _ in range(num_layers - 1): + self.layers.append( + SAGEConv(hidden_units, hidden_units, aggregator_type="gcn") + ) + self.layers.append(SAGEConv(hidden_units, 1, aggregator_type="gcn")) + else: + raise ValueError("Gcn type error.") + + self.pooling = SortPooling(k=k) + conv1d_channels = [16, 32] + total_latent_dim = hidden_units * num_layers + 1 + conv1d_kws = [total_latent_dim, 5] + self.conv_1 = nn.Conv1d(1, conv1d_channels[0], conv1d_kws[0], conv1d_kws[0]) + self.maxpool1d = nn.MaxPool1d(2, 2) + self.conv_2 = nn.Conv1d( + conv1d_channels[0], conv1d_channels[1], conv1d_kws[1], 1 + ) + dense_dim = int((k - 2) / 2 + 1) + dense_dim = (dense_dim - conv1d_kws[1] + 1) * conv1d_channels[1] + self.linear_1 = nn.Linear(dense_dim, 128) + self.linear_2 = nn.Linear(128, 1) + + def forward(self, g, z, node_id=None, edge_id=None): + """ + Args: + g(DGLGraph): the graph + z(Tensor): node labeling tensor, shape [N, 1] + node_id(Tensor, optional): node id tensor, shape [N, 1] + edge_id(Tensor, optional): edge id tensor, shape [E, 1] + Returns: + x(Tensor): output tensor + """ + z_emb = self.z_embedding(z) + if self.use_attribute: + x = self.node_attributes_lookup(node_id) + x = torch.cat([z_emb, x], 1) + else: + x = z_emb + if self.use_edge_weight: + edge_weight = self.edge_weights_lookup(edge_id) + else: + edge_weight = None + + if self.use_embedding: + n_emb = self.node_embedding(node_id) + x = torch.cat([x, n_emb], 1) + + xs = [x] + for layer in self.layers: + out = torch.tanh(layer(g, xs[-1], edge_weight=edge_weight)) + xs += [out] + + x = torch.cat(xs[1:], dim=-1) + + # SortPooling + x = self.pooling(g, x) + x = x.unsqueeze(1) + x = F.relu(self.conv_1(x)) + x = self.maxpool1d(x) + x = F.relu(self.conv_2(x)) + x = x.view(x.size(0), -1) + + x = 
F.relu(self.linear_1(x)) + x = F.dropout(x, p=self.dropout, training=self.training) + x = self.linear_2(x) + + return x + + +def parse_arguments(): + """ + Parse arguments + """ + parser = argparse.ArgumentParser(description="SEAL") + parser.add_argument("--dataset", type=str, default="ogbl-ddi") + parser.add_argument("--gpu_id", type=int, default=0) + parser.add_argument("--hop", type=int, default=1) + parser.add_argument("--model", type=str, default="dgcnn") + parser.add_argument("--gcn_type", type=str, default="gcn") + parser.add_argument("--num_layers", type=int, default=3) + parser.add_argument("--hidden_units", type=int, default=32) + parser.add_argument("--sort_k", type=int, default=30) + parser.add_argument("--pooling", type=str, default="sum") + parser.add_argument("--dropout", type=float, default=0.5) + parser.add_argument("--hits_k", type=int, default=50) + parser.add_argument("--lr", type=float, default=0.0001) + parser.add_argument("--neg_samples", type=int, default=1) + parser.add_argument("--subsample_ratio", type=float, default=0.1) + parser.add_argument("--epochs", type=int, default=60) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--eval_steps", type=int, default=5) + parser.add_argument("--num_workers", type=int, default=32) + parser.add_argument("--random_seed", type=int, default=2021) + parser.add_argument("--save_dir", type=str, default="./processed") + args = parser.parse_args() + + return args + + +def load_ogb_dataset(dataset): + """ + Load OGB dataset + Args: + dataset(str): name of dataset (ogbl-collab, ogbl-ddi, ogbl-citation) + + Returns: + graph(DGLGraph): graph + split_edge(dict): split edge + + """ + dataset = DglLinkPropPredDataset(name=dataset) + split_edge = dataset.get_edge_split() + graph = dataset[0] + + return graph, split_edge + + +def drnl_node_labeling(subgraph, src, dst): + """ + Double Radius Node Labeling (DRNL): + d = r(i,u) + r(i,v) + label = 1 + min(r(i,u), r(i,v)) + (d // 2) * (d // 2 + d % 2 - 1) + Isolated nodes in the subgraph are labeled zero. + An extremely large graph may cause a memory error.
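+
+    For example, a node at distance 1 from src and distance 2 from dst gets
+    d = 3 and label = 1 + min(1, 2) + (3 // 2) * (3 // 2 + 3 % 2 - 1) = 3,
+    while the two target nodes src and dst are always labeled 1.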
+ + Args: + subgraph(DGLGraph): The graph + src(int): node id of one of src node in new subgraph + dst(int): node id of one of dst node in new subgraph + Returns: + z(Tensor): node labeling tensor + """ + adj = subgraph.adj_external().to_dense().numpy() + src, dst = (dst, src) if src > dst else (src, dst) + + idx = list(range(src)) + list(range(src + 1, adj.shape[0])) + adj_wo_src = adj[idx, :][:, idx] + + idx = list(range(dst)) + list(range(dst + 1, adj.shape[0])) + adj_wo_dst = adj[idx, :][:, idx] + + dist2src = shortest_path(adj_wo_dst, directed=False, unweighted=True, indices=src) + dist2src = np.insert(dist2src, dst, 0, axis=0) + dist2src = torch.from_numpy(dist2src) + + dist2dst = shortest_path( + adj_wo_src, directed=False, unweighted=True, indices=dst - 1 + ) + dist2dst = np.insert(dist2dst, src, 0, axis=0) + dist2dst = torch.from_numpy(dist2dst) + + dist = dist2src + dist2dst + dist_over_2, dist_mod_2 = dist // 2, dist % 2 + + z = 1 + torch.min(dist2src, dist2dst) + z += dist_over_2 * (dist_over_2 + dist_mod_2 - 1) + z[src] = 1.0 + z[dst] = 1.0 + z[torch.isnan(z)] = 0.0 + + return z.to(torch.long) + + +def evaluate_hits(name, pos_pred, neg_pred, K): + """ + Compute hits + Args: + name(str): name of dataset + pos_pred(Tensor): predict value of positive edges + neg_pred(Tensor): predict value of negative edges + K(int): num of hits + + Returns: + hits(float): score of hits + + + """ + evaluator = Evaluator(name) + evaluator.K = K + hits = evaluator.eval( + { + "y_pred_pos": pos_pred, + "y_pred_neg": neg_pred, + } + )[f"hits@{K}"] + + return hits + + +class GraphDataSet(Dataset): + """ + GraphDataset for torch DataLoader + """ + + def __init__(self, graph_list, tensor): + self.graph_list = graph_list + self.tensor = tensor + + def __len__(self): + return len(self.graph_list) + + def __getitem__(self, index): + return (self.graph_list[index], self.tensor[index]) + + +class PosNegEdgesGenerator(object): + """ + Generate positive and negative samples + Attributes: + g(dgl.DGLGraph): graph + split_edge(dict): split edge + neg_samples(int): num of negative samples per positive sample + subsample_ratio(float): ratio of subsample + shuffle(bool): if shuffle generated graph list + """ + + def __init__(self, g, split_edge, neg_samples=1, subsample_ratio=0.1, shuffle=True): + self.neg_sampler = Uniform(neg_samples) + self.subsample_ratio = subsample_ratio + self.split_edge = split_edge + self.g = g + self.shuffle = shuffle + + def __call__(self, split_type): + if split_type == "train": + subsample_ratio = self.subsample_ratio + else: + subsample_ratio = 1 + + pos_edges = self.split_edge[split_type]["edge"] + if split_type == "train": + # Adding self loop in train avoids sampling the source node itself. + g = add_self_loop(self.g) + eids = g.edge_ids(pos_edges[:, 0], pos_edges[:, 1]) + neg_edges = torch.stack(self.neg_sampler(g, eids), dim=1) + else: + neg_edges = self.split_edge[split_type]["edge_neg"] + pos_edges = self.subsample(pos_edges, subsample_ratio).long() + neg_edges = self.subsample(neg_edges, subsample_ratio).long() + + edges = torch.cat([pos_edges, neg_edges]) + labels = torch.cat( + [ + torch.ones(pos_edges.size(0), 1), + torch.zeros(neg_edges.size(0), 1), + ] + ) + if self.shuffle: + perm = torch.randperm(edges.size(0)) + edges = edges[perm] + labels = labels[perm] + return edges, labels + + def subsample(self, edges, subsample_ratio): + """ + Subsample generated edges. 
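+        With subsample_ratio = 0.1 a random 10% of the edges are kept; the
+        generator applies this only to the train split and keeps val/test whole.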
+ Args: + edges(Tensor): edges to subsample + subsample_ratio(float): ratio of subsample + + Returns: + edges(Tensor): edges + + """ + + num_edges = edges.size(0) + perm = torch.randperm(num_edges) + perm = perm[: int(subsample_ratio * num_edges)] + edges = edges[perm] + return edges + + +class EdgeDataSet(Dataset): + """ + Assistant Dataset for speeding up the SEALSampler + """ + + def __init__(self, edges, labels, transform): + self.edges = edges + self.transform = transform + self.labels = labels + + def __len__(self): + return len(self.edges) + + def __getitem__(self, index): + subgraph = self.transform(self.edges[index]) + return (subgraph, self.labels[index]) + + +class SEALSampler(object): + """ + Sampler for SEAL in paper(no-block version) + The strategy is to sample all the k-hop neighbors around the two target nodes. + Attributes: + graph(DGLGraph): The graph + hop(int): num of hop + num_workers(int): num of workers + + """ + + def __init__(self, graph, hop=1, num_workers=32, print_fn=print): + self.graph = graph + self.hop = hop + self.print_fn = print_fn + self.num_workers = num_workers + + def sample_subgraph(self, target_nodes): + """ + Args: + target_nodes(Tensor): Tensor of two target nodes + Returns: + subgraph(DGLGraph): subgraph + """ + sample_nodes = [target_nodes] + frontiers = target_nodes + + for i in range(self.hop): + frontiers = self.graph.out_edges(frontiers)[1] + frontiers = torch.unique(frontiers) + sample_nodes.append(frontiers) + + sample_nodes = torch.cat(sample_nodes) + sample_nodes = torch.unique(sample_nodes) + subgraph = dgl.node_subgraph(self.graph, sample_nodes) + + # Each node should have unique node id in the new subgraph + u_id = int( + torch.nonzero(subgraph.ndata[NID] == int(target_nodes[0]), as_tuple=False) + ) + v_id = int( + torch.nonzero(subgraph.ndata[NID] == int(target_nodes[1]), as_tuple=False) + ) + + # remove link between target nodes in positive subgraphs. + if subgraph.has_edges_between(u_id, v_id): + link_id = subgraph.edge_ids(u_id, v_id, return_uv=True)[2] + subgraph.remove_edges(link_id) + if subgraph.has_edges_between(v_id, u_id): + link_id = subgraph.edge_ids(v_id, u_id, return_uv=True)[2] + subgraph.remove_edges(link_id) + + z = drnl_node_labeling(subgraph, u_id, v_id) + subgraph.ndata["z"] = z + + return subgraph + + def _collate(self, batch): + batch_graphs, batch_labels = map(list, zip(*batch)) + + batch_graphs = dgl.batch(batch_graphs) + batch_labels = torch.stack(batch_labels) + return batch_graphs, batch_labels + + def __call__(self, edges, labels): + subgraph_list = [] + labels_list = [] + edge_dataset = EdgeDataSet(edges, labels, transform=self.sample_subgraph) + self.print_fn("Using {} workers in sampling job.".format(self.num_workers)) + sampler = DataLoader( + edge_dataset, + batch_size=32, + num_workers=self.num_workers, + shuffle=False, + collate_fn=self._collate, + ) + for subgraph, label in tqdm(sampler, ncols=100): + label_copy = deepcopy(label) + subgraph = dgl.unbatch(subgraph) + + del label + subgraph_list += subgraph + labels_list.append(label_copy) + + return subgraph_list, torch.cat(labels_list) + + +class SEALData(object): + """ + 1. Generate positive and negative samples + 2. Subgraph sampling + + Attributes: + g(dgl.DGLGraph): graph + split_edge(dict): split edge + hop(int): num of hop + neg_samples(int): num of negative samples per positive sample + subsample_ratio(float): ratio of subsample + use_coalesce(bool): True for coalesce graph. 
Graph with multi-edge need to coalesce + """ + + def __init__( + self, + g, + split_edge, + hop=1, + neg_samples=1, + subsample_ratio=1, + prefix=None, + save_dir=None, + num_workers=32, + shuffle=True, + use_coalesce=True, + print_fn=print, + ): + self.g = g + self.hop = hop + self.subsample_ratio = subsample_ratio + self.prefix = prefix + self.save_dir = save_dir + self.print_fn = print_fn + + self.generator = PosNegEdgesGenerator( + g=self.g, + split_edge=split_edge, + neg_samples=neg_samples, + subsample_ratio=subsample_ratio, + shuffle=shuffle, + ) + if use_coalesce: + for k, v in g.edata.items(): + g.edata[k] = v.float() # dgl.to_simple() requires data is float + self.g = dgl.to_simple( + g, copy_ndata=True, copy_edata=True, aggregator="sum" + ) + + self.ndata = {k: v for k, v in self.g.ndata.items()} + self.edata = {k: v for k, v in self.g.edata.items()} + self.g.ndata.clear() + self.g.edata.clear() + self.print_fn("Save ndata and edata in class.") + self.print_fn("Clear ndata and edata in graph.") + + self.sampler = SEALSampler( + graph=self.g, hop=hop, num_workers=num_workers, print_fn=print_fn + ) + + def __call__(self, split_type): + if split_type == "train": + subsample_ratio = self.subsample_ratio + else: + subsample_ratio = 1 + + path = osp.join( + self.save_dir or "", + "{}_{}_{}-hop_{}-subsample.bin".format( + self.prefix, split_type, self.hop, subsample_ratio + ), + ) + + if osp.exists(path): + self.print_fn("Load existing processed {} files".format(split_type)) + graph_list, data = dgl.load_graphs(path) + dataset = GraphDataSet(graph_list, data["labels"]) + + else: + self.print_fn("Processed {} files not exist.".format(split_type)) + + edges, labels = self.generator(split_type) + self.print_fn("Generate {} edges totally.".format(edges.size(0))) + + graph_list, labels = self.sampler(edges, labels) + dataset = GraphDataSet(graph_list, labels) + dgl.save_graphs(path, graph_list, {"labels": labels}) + self.print_fn("Save preprocessed subgraph to {}".format(path)) + return dataset + + +def _transform_log_level(str_level): + if str_level == "info": + return logging.INFO + elif str_level == "warning": + return logging.WARNING + elif str_level == "critical": + return logging.CRITICAL + elif str_level == "debug": + return logging.DEBUG + elif str_level == "error": + return logging.ERROR + else: + raise KeyError("Log level error") + + +class LightLogging(object): + def __init__(self, log_path=None, log_name="lightlog", log_level="debug"): + log_level = _transform_log_level(log_level) + + if log_path: + if not log_path.endswith("/"): + log_path += "/" + if not os.path.exists(log_path): + os.mkdir(log_path) + + if log_name.endswith("-") or log_name.endswith("_"): + log_name = ( + log_path + + log_name + + time.strftime("%Y-%m-%d-%H:%M", time.localtime(time.time())) + + ".log" + ) + else: + log_name = ( + log_path + + log_name + + "_" + + time.strftime("%Y-%m-%d-%H-%M", time.localtime(time.time())) + + ".log" + ) + + logging.basicConfig( + level=log_level, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y-%m-%d-%H:%M", + handlers=[ + logging.FileHandler(log_name, mode="w"), + logging.StreamHandler(), + ], + ) + logging.info("Start Logging") + logging.info("Log file path: {}".format(log_name)) + + else: + logging.basicConfig( + level=log_level, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y-%m-%d-%H:%M", + handlers=[logging.StreamHandler()], + ) + logging.info("Start Logging") + + def debug(self, msg): + logging.debug(msg) + + def info(self, msg): + 
logging.info(msg) + + def critical(self, msg): + logging.critical(msg) + + def warning(self, msg): + logging.warning(msg) + + def error(self, msg): + logging.error(msg) + + +def data_prepare(graph, split_edge): + seal_data = SEALData( + g=graph, + split_edge=split_edge, + hop=1, + neg_samples=1, + subsample_ratio=0.1, + use_coalesce=True, + prefix="ogbl-collab", + save_dir="./processed", + num_workers=32, + print_fn=print, + ) + node_attribute = seal_data.ndata["feat"] + edge_weight = seal_data.edata["weight"].float() + return node_attribute, edge_weight diff --git a/hugegraph-ml/src/hugegraph_ml/tasks/fraud_detector_caregnn.py b/hugegraph-ml/src/hugegraph_ml/tasks/fraud_detector_caregnn.py new file mode 100644 index 00000000..fe14d6db --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/tasks/fraud_detector_caregnn.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import torch +from dgl import DGLGraph +from torch import nn +from sklearn.metrics import recall_score, roc_auc_score +from torch.nn.functional import softmax + + +class DetectorCaregnn: + def __init__(self, graph: DGLGraph, model: nn.Module): + self.graph = graph + self._model = model + self._device = "" + + def train( + self, + lr: float = 1e-3, + weight_decay: float = 0, + n_epochs: int = 200, + gpu: int = -1, + ): + + self._device = ( + f"cuda:{gpu}" if gpu != -1 and torch.cuda.is_available() else "cpu" + ) + self._model.to(self._device) + self.graph = self.graph.to(self._device) + labels = self.graph.ndata["label"].to(self._device) + feat = self.graph.ndata["feature"].to(self._device) + train_mask = self.graph.ndata["train_mask"] + val_mask = self.graph.ndata["val_mask"] + test_mask = self.graph.ndata["test_mask"] + train_idx = ( + torch.nonzero(train_mask, as_tuple=False).squeeze(1).to(self._device) + ) + val_idx = torch.nonzero(val_mask, as_tuple=False).squeeze(1).to(self._device) + test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze(1).to(self._device) + rl_idx = torch.nonzero( + train_mask.to(self._device) & labels.bool(), as_tuple=False + ).squeeze(1) + _, cnt = torch.unique(labels, return_counts=True) + loss_fn = torch.nn.CrossEntropyLoss(weight=1 / cnt) + optimizer = torch.optim.Adam( + self._model.parameters(), lr=lr, weight_decay=weight_decay + ) + for epoch in range(n_epochs): + self._model.train() + logits_gnn, logits_sim = self._model(self.graph, feat) + tr_loss = loss_fn(logits_gnn[train_idx], labels[train_idx]) + 2 * loss_fn( + logits_sim[train_idx], labels[train_idx] + ) + + tr_recall = recall_score( + labels[train_idx].cpu(), + logits_gnn.data[train_idx].argmax(dim=1).cpu(), + ) + tr_auc = roc_auc_score( + labels[train_idx].cpu(), + softmax(logits_gnn, dim=1).data[train_idx][:, 1].cpu(), + ) + val_loss = loss_fn(logits_gnn[val_idx], labels[val_idx]) 
+ 2 * loss_fn( + logits_sim[val_idx], labels[val_idx] + ) + val_recall = recall_score( + labels[val_idx].cpu(), logits_gnn.data[val_idx].argmax(dim=1).cpu() + ) + val_auc = roc_auc_score( + labels[val_idx].cpu(), + softmax(logits_gnn, dim=1).data[val_idx][:, 1].cpu(), + ) + optimizer.zero_grad() + tr_loss.backward() + optimizer.step() + print( + "Epoch {}, Train: Recall: {:.4f} AUC: {:.4f} Loss: {:.4f} | Val: Recall: {:.4f} AUC: {:.4f} Loss: {:.4f}".format( + epoch, + tr_recall, + tr_auc, + tr_loss.item(), + val_recall, + val_auc, + val_loss.item(), + ) + ) + self._model.RLModule(self.graph, epoch, rl_idx) + + def evaluate(self): + labels = self.graph.ndata["label"].to(self._device) + feat = self.graph.ndata["feature"].to(self._device) + test_mask = self.graph.ndata["test_mask"] + test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze(1).to(self._device) + _, cnt = torch.unique(labels, return_counts=True) + loss_fn = torch.nn.CrossEntropyLoss(weight=1 / cnt) + self._model.eval() + logits_gnn, logits_sim = self._model.forward(self.graph, feat) + test_loss = loss_fn(logits_gnn[test_idx], labels[test_idx]) + 2 * loss_fn( + logits_sim[test_idx], labels[test_idx] + ) + test_recall = recall_score( + labels[test_idx].cpu(), logits_gnn[test_idx].argmax(dim=1).cpu() + ) + test_auc = roc_auc_score( + labels[test_idx].cpu(), + softmax(logits_gnn, dim=1).data[test_idx][:, 1].cpu(), + ) + return {"recall": test_recall, "accuracy": test_auc, "loss": test_loss.item()} diff --git a/hugegraph-ml/src/hugegraph_ml/tasks/hetero_sample_embed_gatne.py b/hugegraph-ml/src/hugegraph_ml/tasks/hetero_sample_embed_gatne.py new file mode 100644 index 00000000..62af5d2c --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/tasks/hetero_sample_embed_gatne.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
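+
+# A minimal sketch of driving this task, assuming `graph` is a DGL heterograph
+# exported from HugeGraph and `model` is the GATNE model defined in
+# hugegraph_ml.models.gatne:
+#
+#   task = HeteroSampleEmbedGATNE(graph, model)
+#   task.train_and_embed(lr=1e-3, n_epochs=200, gpu=-1)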
+ + +import dgl +import torch +from torch import nn +from tqdm.auto import tqdm +import numpy as np +from hugegraph_ml.models.gatne import ( + construct_typenodes_from_graph, + generate_pairs, + NSLoss, + NeighborSampler, +) + + +class HeteroSampleEmbedGATNE: + def __init__(self, graph, model: nn.Module): + self.graph = graph + self._model = model + self._device = "" + + def train_and_embed( + self, + lr: float = 1e-3, + n_epochs: int = 200, + gpu: int = -1, + ): + self._device = ( + f"cuda:{gpu}" if gpu != -1 and torch.cuda.is_available() else "cpu" + ) + self._model = self._model.to(self._device) + self.graph = self.graph.to(self._device) + type_nodes = construct_typenodes_from_graph(self.graph) + edge_type_count = len(self.graph.etypes) + neighbor_samples = 10 + num_walks = 20 + num_workers = 4 + window_size = 5 + batch_size = 64 + num_sampled = 5 + embedding_size = 200 + all_walks = [] + for i in range(edge_type_count): + nodes = torch.LongTensor(type_nodes[i] * num_walks).to(self._device) + traces, types = dgl.sampling.random_walk( + self.graph, + nodes, + metapath=[self.graph.etypes[i]] * (neighbor_samples - 1), + ) + all_walks.append(traces) + + train_pairs = generate_pairs(all_walks, window_size, num_workers) + neighbor_sampler = NeighborSampler(self.graph, [neighbor_samples]) + train_dataloader = torch.utils.data.DataLoader( + train_pairs, + batch_size=batch_size, + collate_fn=neighbor_sampler.sample, + shuffle=True, + num_workers=num_workers, + pin_memory=True, + ) + nsloss = NSLoss(self.graph.number_of_nodes(), num_sampled, embedding_size) + self._model.to(self._device) + nsloss.to(self._device) + + optimizer = torch.optim.Adam( + [{"params": self._model.parameters()}, {"params": nsloss.parameters()}], + lr=lr, + ) + + for epoch in range(n_epochs): + self._model.train() + # train_pairs is a 2-D NumPy array, so shuffle rows with NumPy + np.random.shuffle(train_pairs) + + data_iter = tqdm( + train_dataloader, + desc="epoch %d" % (epoch), + total=(len(train_pairs) + (batch_size - 1)) // batch_size, + ) + avg_loss = 0.0 + for i, (block, head_invmap, tails, block_types) in enumerate(data_iter): + optimizer.zero_grad() + # embs: [batch_size, edge_type_count, embedding_size] + block_types = block_types.to(self._device) + embs = self._model(block[0].to(self._device))[head_invmap] + embs = embs.gather( + 1, + block_types.view(-1, 1, 1).expand(embs.shape[0], 1, embs.shape[2]), + )[:, 0] + loss = nsloss( + block[0].dstdata[dgl.NID][head_invmap].to(self._device), + embs, + tails.to(self._device), + ) + loss.backward() + optimizer.step() + avg_loss += loss.item() + + post_fix = { + "epoch": epoch, + "iter": i, + "avg_loss": avg_loss / (i + 1), + "loss": loss.item(), + } + data_iter.set_postfix(post_fix)
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import torch +import dgl +from torch import nn +from tqdm import trange +import numpy as np +from hugegraph_ml.models.pgnn import ( + get_dataset, + preselect_anchor, + train_model, + eval_model, +) + + +class LinkPredictionPGNN: + def __init__(self, graph, model: nn.Module): + self.graph = graph + self._model = model + self._device = "" + + def train( + self, + lr: float = 1e-3, + weight_decay: float = 0, + n_epochs: int = 200, + gpu: int = -1, + ): + self._device = ( + f"cuda:{gpu}" if gpu != -1 and torch.cuda.is_available() else "cpu" + ) + self._model.to(self._device) + data = get_dataset(self.graph) + # pre-sample anchor nodes and compute shortest distance values for all epochs + ( + g_list, + anchor_eid_list, + dist_max_list, + edge_weight_list, + ) = preselect_anchor(data) + optimizer = torch.optim.Adam( + self._model.parameters(), lr=lr, weight_decay=weight_decay + ) + loss_func = nn.BCEWithLogitsLoss() + best_auc_val = -1 + best_auc_test = -1 + for epoch in range(n_epochs): + if epoch == 200: + for param_group in optimizer.param_groups: + param_group["lr"] /= 10 + + g = dgl.graph(g_list[epoch]) + g.ndata["feat"] = torch.FloatTensor(data["feature"]) + g.edata["sp_dist"] = torch.FloatTensor(edge_weight_list[epoch]) + g_data = { + "graph": g.to(self._device), + "anchor_eid": anchor_eid_list[epoch], + "dists_max": dist_max_list[epoch], + } + + train_model(data, self._model, loss_func, optimizer, self._device, g_data) + + loss_train, auc_train, auc_val, auc_test = eval_model( + data, g_data, self._model, loss_func, self._device + ) + if auc_val > best_auc_val: + best_auc_val = auc_val + best_auc_test = auc_test + + if epoch % 100 == 0: + print( + epoch, + "Loss {:.4f}".format(loss_train), + "Train AUC: {:.4f}".format(auc_train), + "Val AUC: {:.4f}".format(auc_val), + "Test AUC: {:.4f}".format(auc_test), + "Best Val AUC: {:.4f}".format(best_auc_val), + "Best Test AUC: {:.4f}".format(best_auc_test), + ) diff --git a/hugegraph-ml/src/hugegraph_ml/tasks/link_prediction_seal.py b/hugegraph-ml/src/hugegraph_ml/tasks/link_prediction_seal.py new file mode 100644 index 00000000..c307cc10 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/tasks/link_prediction_seal.py @@ -0,0 +1,172 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
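+
+# A minimal sketch, assuming `graph` and `split_edge` come from an OGB-style
+# loader such as load_ogb_dataset in models/seal.py:
+#
+#   model = DGCNN(num_layers=3, hidden_units=32)
+#   task = LinkPredictionSeal(graph, split_edge, model)
+#   task.train(lr=1e-3, n_epochs=60, gpu=-1)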
+ + +from typing import Literal + +import torch +from dgl import DGLGraph, NID, EID +from torch import nn +from tqdm import tqdm +from dgl.dataloading import GraphDataLoader +from torch.nn import BCEWithLogitsLoss +import time +import numpy as np +from hugegraph_ml.models.seal import SEALData, DGCNN, evaluate_hits +from hugegraph_ml.utils.early_stopping import EarlyStopping + + +class LinkPredictionSeal: + def __init__(self, graph: DGLGraph, split_edge, model): + self.graph = graph + self._model = model + self.split_edge = split_edge + self._device = "" + self.train_loader = None + self.val_loader = None + self.test_loader = None + self.train_graphs = None + self.data_prepare() + + def data_prepare(self): + seal_data = SEALData( + g=self.graph, + split_edge=self.split_edge, + hop=1, + neg_samples=1, + subsample_ratio=0.1, + use_coalesce=True, + prefix="ogbl-collab", + save_dir="./processed", + num_workers=32, + print_fn=print, + ) + node_attribute = seal_data.ndata["feat"] + edge_weight = seal_data.edata["weight"].float() + train_data = seal_data("train") + val_data = seal_data("valid") + test_data = seal_data("test") + self.train_graphs = len(train_data.graph_list) + self.train_loader = GraphDataLoader(train_data, batch_size=32, num_workers=32) + self.val_loader = GraphDataLoader(val_data, batch_size=32, num_workers=32) + self.test_loader = GraphDataLoader(test_data, batch_size=32, num_workers=32) + + def _train( + self, + dataloader, + loss_fn, + optimizer, + num_graphs=32, + total_graphs=None, + ): + self._model.train() + + total_loss = 0 + for g, labels in tqdm(dataloader, ncols=100): + g = g.to(self._device) + labels = labels.to(self._device) + optimizer.zero_grad() + logits = self._model(g, g.ndata["z"], g.ndata[NID], g.edata[EID]) + loss = loss_fn(logits, labels) + loss.backward() + optimizer.step() + total_loss += loss.item() * num_graphs + return total_loss / total_graphs + + def train( + self, + lr: float = 1e-3, + n_epochs: int = 200, + gpu: int = -1, + ): + torch.manual_seed(2021) + self._device = ( + f"cuda:{gpu}" if gpu != -1 and torch.cuda.is_available() else "cpu" + ) + self._model.to(self._device) + self.graph = self.graph.to(self._device) + parameters = self._model.parameters() + optimizer = torch.optim.Adam(parameters, lr=lr) + loss_fn = BCEWithLogitsLoss() + print( + "Total parameters: {}".format( + sum([p.numel() for p in self._model.parameters()]) + ) + ) + + # train and evaluate loop + summary_val = [] + summary_test = [] + for epoch in range(n_epochs): + start_time = time.time() + loss = self._train( + dataloader=self.train_loader, + loss_fn=loss_fn, + optimizer=optimizer, + num_graphs=32, + total_graphs=self.train_graphs, + ) + train_time = time.time() + if epoch % 5 == 0: + val_pos_pred, val_neg_pred = self.evaluate(dataloader=self.val_loader) + test_pos_pred, test_neg_pred = self.evaluate( + dataloader=self.test_loader + ) + + val_metric = evaluate_hits( + "ogbl-collab", val_pos_pred, val_neg_pred, 50 + ) + test_metric = evaluate_hits( + "ogbl-collab", test_pos_pred, test_neg_pred, 50 + ) + evaluate_time = time.time() + print( + "Epoch-{}, train loss: {:.4f}, hits@{}: val-{:.4f}, test-{:.4f}, " + "cost time: train-{:.1f}s, total-{:.1f}s".format( + epoch, + loss, + 50, + val_metric, + test_metric, + train_time - start_time, + evaluate_time - start_time, + ) + ) + summary_val.append(val_metric) + summary_test.append(test_metric) + summary_test = np.array(summary_test) + + print("Experiment Results:") + print( + "Best hits@{}: {:.4f}, epoch: {}".format( + 50, 
np.max(summary_test), np.argmax(summary_test) + ) + ) + + @torch.no_grad() + def evaluate(self, dataloader): + self._model.eval() + y_pred, y_true = [], [] + for g, labels in tqdm(dataloader, ncols=100): + g = g.to(self._device) + logits = self._model(g, g.ndata["z"], g.ndata[NID], g.edata[EID]) + y_pred.append(logits.view(-1).cpu()) + y_true.append(labels.view(-1).cpu().to(torch.float)) + y_pred, y_true = torch.cat(y_pred), torch.cat(y_true) + pos_pred = y_pred[y_true == 1] + neg_pred = y_pred[y_true == 0] + return pos_pred, neg_pred diff --git a/hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_edge.py b/hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_edge.py new file mode 100644 index 00000000..57276ff6 --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_edge.py @@ -0,0 +1,123 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from typing import Literal + +import torch +from dgl import DGLGraph +from torch import nn +from tqdm import trange + +from hugegraph_ml.utils.early_stopping import EarlyStopping + + +class NodeClassifyWithEdge: + def __init__(self, graph: DGLGraph, model: nn.Module): + self.graph = graph + self._model = model + self._device = "" + self._early_stopping = None + self._is_trained = False + self._check_graph() + + def _check_graph(self): + required_node_attrs = ["feat", "label", "train_mask", "val_mask", "test_mask"] + for attr in required_node_attrs: + if attr not in self.graph.ndata: + raise ValueError( + f"Graph is missing required node attribute '{attr}' in ndata." + ) + required_edge_attrs = ["feat"] + for attr in required_edge_attrs: + if attr not in self.graph.edata: + raise ValueError( + f"Graph is missing required edge attribute '{attr}' in edata." 
+ ) + + def _evaluate(self, edge_feats, node_feats, labels, mask): + self._model.eval() + labels = labels[mask] + with torch.no_grad(): + logits = self._model.inference(self.graph, edge_feats, node_feats)[mask] + loss = self._model.loss(logits, labels) + _, predicted = torch.max(logits, dim=1) + accuracy = (predicted == labels).sum().item() / len(labels) + return {"accuracy": accuracy, "loss": loss.item()} + + def train( + self, + lr: float = 1e-3, + weight_decay: float = 0, + n_epochs: int = 200, + patience: int = float("inf"), + early_stopping_monitor: Literal["loss", "accuracy"] = "loss", + gpu: int = -1, + ): + # Set device for training + self._device = ( + f"cuda:{gpu}" if gpu != -1 and torch.cuda.is_available() else "cpu" + ) + self._early_stopping = EarlyStopping( + patience=patience, monitor=early_stopping_monitor + ) + self._model.to(self._device) + self.graph = self.graph.to(self._device) + # Get node features, labels, masks and move to device + edge_feats = self.graph.edata["feat"].to(self._device) + node_feats = self.graph.ndata["feat"].to(self._device) + labels = self.graph.ndata["label"].to(self._device) + train_mask = self.graph.ndata["train_mask"].to(self._device) + val_mask = self.graph.ndata["val_mask"].to(self._device) + optimizer = torch.optim.Adam( + self._model.parameters(), lr=lr, weight_decay=weight_decay + ) + # Training model + epochs = trange(n_epochs) + for epoch in epochs: + # train + self._model.train() + optimizer.zero_grad() + # forward pass, get logits, compute loss + logits = self._model(self.graph, edge_feats, node_feats) + logits_train_masked = logits[train_mask] + loss = self._model.loss(logits_train_masked, labels[train_mask]) + loss.backward() + optimizer.step() + # validation + valid_metrics = self._evaluate(edge_feats, node_feats, labels, val_mask) + # logs + epochs.set_description( + f"epoch {epoch} | train loss {loss.item():.4f} | val loss {valid_metrics['loss']:.4f}" + ) + # early stopping + self._early_stopping( + valid_metrics[self._early_stopping.monitor], self._model + ) + torch.cuda.empty_cache() + if self._early_stopping.early_stop: + break + self._early_stopping.load_best_model(self._model) + self._is_trained = True + + def evaluate(self): + test_mask = self.graph.ndata["test_mask"].to(self._device) + edge_feats = self.graph.edata["feat"].to(self._device) + node_feats = self.graph.ndata["feat"].to(self._device) + labels = self.graph.ndata["label"].to(self._device) + metrics = self._evaluate(edge_feats, node_feats, labels, test_mask) + return metrics diff --git a/hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_sample.py b/hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_sample.py new file mode 100644 index 00000000..393cd09c --- /dev/null +++ b/hugegraph-ml/src/hugegraph_ml/tasks/node_classify_with_sample.py @@ -0,0 +1,156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from typing import Literal
+
+import torch
+from dgl import DGLGraph
+from torch import nn
+from tqdm import trange
+import dgl
+import numpy as np
+
+from hugegraph_ml.utils.early_stopping import EarlyStopping
+
+
+class NodeClassifyWithSample:
+    def __init__(self, graph: DGLGraph, model: nn.Module):
+        self.graph = graph
+        self._model = model
+        self.gpu = -1
+        self._device = (
+            f"cuda:{self.gpu}"
+            if self.gpu != -1 and torch.cuda.is_available()
+            else "cpu"
+        )
+        self._early_stopping = None
+        self._is_trained = False
+        self.num_partitions = 100
+        self.batch_size = 100
+        self.sampler = dgl.dataloading.ClusterGCNSampler(
+            graph,
+            self.num_partitions,
+        )
+        self.dataloader = dgl.dataloading.DataLoader(
+            self.graph,
+            torch.arange(self.num_partitions).to(self._device),
+            self.sampler,
+            device=self._device,
+            batch_size=self.batch_size,
+            shuffle=True,
+            drop_last=False,
+            num_workers=0,
+            # UVA sampling is only valid when sampling onto a GPU;
+            # fall back to normal sampling on CPU
+            use_uva=(self._device != "cpu"),
+        )
+        self._check_graph()
+
+    def _check_graph(self):
+        required_node_attrs = ["feat", "label", "train_mask", "val_mask", "test_mask"]
+        for attr in required_node_attrs:
+            if attr not in self.graph.ndata:
+                raise ValueError(
+                    f"Graph is missing required node attribute '{attr}' in ndata."
+                )
+
+    def train(
+        self,
+        lr: float = 1e-3,
+        weight_decay: float = 0,
+        n_epochs: int = 200,
+        patience: int = float("inf"),
+        early_stopping_monitor: Literal["loss", "accuracy"] = "loss",
+    ):
+        # Set up early stopping
+        early_stopping = EarlyStopping(
+            patience=patience, monitor=early_stopping_monitor
+        )
+        self._model.to(self._device)
+        # Get node features, labels and masks, then move them to the device
+        feats = self.graph.ndata["feat"].to(self._device)
+        labels = self.graph.ndata["label"].to(self._device)
+        train_mask = self.graph.ndata["train_mask"].to(self._device)
+        val_mask = self.graph.ndata["val_mask"].to(self._device)
+        optimizer = torch.optim.Adam(
+            self._model.parameters(), lr=lr, weight_decay=weight_decay
+        )
+        # Train the model
+        loss_fn = nn.CrossEntropyLoss()
+        epochs = trange(n_epochs)
+        for epoch in epochs:
+            # train
+            self._model.train()
+            for it, sg in enumerate(self.dataloader):
+                sg_feats = feats[sg.ndata["_ID"]]
+                sg_labels = labels[sg.ndata["_ID"]]
+                sg_train_mask = train_mask[sg.ndata["_ID"]].bool()
+                logits = self._model(sg, sg_feats)
+                train_loss = loss_fn(logits[sg_train_mask], sg_labels[sg_train_mask])
+                optimizer.zero_grad()
+                train_loss.backward()
+                optimizer.step()
+                # validation
+                valid_metrics = self.evaluate_sg(
+                    sg=sg,
+                    sg_feats=sg_feats,
+                    labels=labels,
+                    val_mask=val_mask,
+                )
+                # logs
+                epochs.set_description(
+                    f"epoch {epoch} | it {it} | train loss {train_loss.item():.4f} | val loss {valid_metrics['loss']:.4f}"
+                )
+                # early stopping
+                early_stopping(valid_metrics[early_stopping.monitor], self._model)
+                torch.cuda.empty_cache()
+                if early_stopping.early_stop:
+                    break
+            # propagate the early stop out of the epoch loop as well
+            if early_stopping.early_stop:
+                break
+        early_stopping.load_best_model(self._model)
+        self._is_trained = True
+
+    def evaluate_sg(self, sg, sg_feats, labels, val_mask):
+        self._model.eval()
+        sg_ids = sg.ndata["_ID"]
+        sg_val_mask = val_mask[sg_ids].bool()
+        # restrict the full label tensor to this subgraph before masking
+        sg_val_labels = labels[sg_ids][sg_val_mask]
+        with torch.no_grad():
+            sg_val_logits = self._model.inference(sg, sg_feats)[sg_val_mask]
+            val_loss = self._model.loss(sg_val_logits, sg_val_labels)
+        _, predicted = torch.max(sg_val_logits, dim=1)
+        accuracy = (predicted == sg_val_labels).sum().item() / len(sg_val_labels)
+        return {"accuracy": accuracy, "loss": val_loss.item()}
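+    # NOTE: ``evaluate_sg`` scores whichever batch the Cluster-GCN loader
+    # produced. With the defaults above (num_partitions == batch_size == 100)
+    # a single batch contains every partition, i.e. the full graph; use a
+    # smaller batch_size to validate on genuine partition subgraphs.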
self.graph.ndata["test_mask"] + feats = self.graph.ndata["feat"] + labels = self.graph.ndata["label"] + test_logits = [] + test_labels = [] + total_loss = 0 + with torch.no_grad(): + for it, sg in enumerate(self.dataloader): + sg_feats = feats[sg.ndata["_ID"]] + sg_labels = labels[sg.ndata["_ID"]] + sg_test_msak = test_mask[sg.ndata["_ID"]].bool() + sg_test_labels = sg_labels[sg_test_msak] + sg_test_logits = self._model.inference(sg, sg_feats)[sg_test_msak] + loss = self._model.loss(sg_test_logits, sg_test_labels) + total_loss += loss + test_logits.append(sg_test_logits) + test_labels.append(sg_test_labels) + test_logits = torch.tensor(np.vstack(test_logits)) + _, predicted = torch.max(test_logits, dim=1) + accuracy = (predicted == test_labels[0]).sum().item() / len(test_labels[0]) + return {"accuracy": accuracy, "total_loss": total_loss.item()} diff --git a/hugegraph-ml/src/hugegraph_ml/utils/dgl2hugegraph_utils.py b/hugegraph-ml/src/hugegraph_ml/utils/dgl2hugegraph_utils.py index c1900422..cdc4ea3f 100644 --- a/hugegraph-ml/src/hugegraph_ml/utils/dgl2hugegraph_utils.py +++ b/hugegraph-ml/src/hugegraph_ml/utils/dgl2hugegraph_utils.py @@ -27,7 +27,11 @@ import torch from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset, LegacyTUDataset, GINDataset, \ get_download_dir -from dgl.data.utils import _get_dgl_url, download +from dgl.data.utils import _get_dgl_url, download, load_graphs +import networkx as nx +from ogb.linkproppred import DglLinkPropPredDataset +import pandas as pd +import json from pyhugegraph.api.graph import GraphManager from pyhugegraph.api.schema import SchemaManager from pyhugegraph.client import PyHugeClient @@ -280,6 +284,647 @@ def import_hetero_graph_from_dgl( if len(edatas) > 0: _add_batch_edges(client_graph, edatas) +def import_hetero_graph_from_dgl_no_feat( + dataset_name, + ip: str = "127.0.0.1", + port: str = "8080", + graph: str = "hugegraph", + user: str = "", + pwd: str = "", + graphspace: Optional[str] = None, +): + # dataset download from: + # https://s3.us-west-2.amazonaws.com/dgl-data/dataset/recsys/GATNE/amazon.zip + dataset_name = dataset_name.upper() + if dataset_name == "AMAZONGATNE": + hetero_graph = load_training_data_gatne() + else: + raise ValueError("dataset not supported") + client: PyHugeClient = PyHugeClient( + ip=ip, port=port, graph=graph, user=user, pwd=pwd, graphspace=graphspace + ) + client_schema: SchemaManager = client.schema() + client_graph: GraphManager = client.graph() + + ntype_to_vertex_label = {} + ntype_idx_to_vertex_id = {} + for ntype in hetero_graph.ntypes: + # create vertex schema + vertex_label = f"{dataset_name}_{ntype}_v" + ntype_to_vertex_label[ntype] = vertex_label + client_schema.vertexLabel(vertex_label).useAutomaticId().ifNotExist().create() + # add vertices for batch of ntype + idx_to_vertex_id = {} + vdatas = [] + idxs = [] + for idx in range(hetero_graph.number_of_nodes(ntype=ntype)): + properties = {} + vdata = [vertex_label, properties] + vdatas.append(vdata) + idxs.append(idx) + if len(vdatas) == MAX_BATCH_NUM: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, idxs)) + vdatas.clear() + idxs.clear() + if len(vdatas) > 0: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, idxs)) + ntype_idx_to_vertex_id[ntype] = idx_to_vertex_id + + # add edges + edatas = [] + for canonical_etype in hetero_graph.canonical_etypes: + # create edge schema + src_type, etype, dst_type = canonical_etype + edge_label = f"{dataset_name}_{etype}_e" + 
client_schema.edgeLabel(edge_label).sourceLabel( + ntype_to_vertex_label[src_type] + ).targetLabel(ntype_to_vertex_label[dst_type]).ifNotExist().create() + # add edges for batch of canonical_etype + srcs, dsts = hetero_graph.edges(etype=canonical_etype) + for src, dst in zip(srcs.numpy(), dsts.numpy()): + edata = [ + edge_label, + ntype_idx_to_vertex_id[src_type][src], + ntype_idx_to_vertex_id[dst_type][dst], + ntype_to_vertex_label[src_type], + ntype_to_vertex_label[dst_type], + {}, + ] + edatas.append(edata) + if len(edatas) == MAX_BATCH_NUM: + _add_batch_edges(client_graph, edatas) + edatas.clear() + if len(edatas) > 0: + _add_batch_edges(client_graph, edatas) + + +def import_graph_from_nx( + dataset_name, + ip: str = "127.0.0.1", + port: str = "8080", + graph: str = "hugegraph", + user: str = "", + pwd: str = "", + graphspace: Optional[str] = None, +): + dataset_name = dataset_name.upper() + if dataset_name == "CAVEMAN": + dataset = nx.connected_caveman_graph(20, 20) + else: + raise ValueError("dataset not supported") + + client: PyHugeClient = PyHugeClient( + ip=ip, port=port, graph=graph, user=user, pwd=pwd, graphspace=graphspace + ) + client_schema: SchemaManager = client.schema() + client_graph: GraphManager = client.graph() + # create property schema + # check props and create vertex label + vertex_label = f"{dataset_name}_vertex" + props_value = {} + client_schema.vertexLabel(vertex_label).useAutomaticId().ifNotExist().create() + # add vertices for batch (note MAX_BATCH_NUM) + idx_to_vertex_id = {} + vdatas = [] + vidxs = [] + for idx in dataset.nodes: + vdata = [vertex_label, {}] + vdatas.append(vdata) + vidxs.append(idx) + if len(vdatas) == MAX_BATCH_NUM: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, vidxs)) + vdatas.clear() + vidxs.clear() + # add rest vertices + if len(vdatas) > 0: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, vidxs)) + + # add edges for batch + edge_label = f"{dataset_name}_edge" + client_schema.edgeLabel(edge_label).sourceLabel(vertex_label).targetLabel( + vertex_label + ).ifNotExist().create() + edatas = [] + for edge in dataset.edges: + edata = [ + edge_label, + idx_to_vertex_id[edge[0]], + idx_to_vertex_id[edge[1]], + vertex_label, + vertex_label, + {}, + ] + edatas.append(edata) + if len(edatas) == MAX_BATCH_NUM: + _add_batch_edges(client_graph, edatas) + edatas.clear() + if len(edatas) > 0: + _add_batch_edges(client_graph, edatas) + + +def import_graph_from_dgl_with_edge_feat( + dataset_name, + ip: str = "127.0.0.1", + port: str = "8080", + graph: str = "hugegraph", + user: str = "", + pwd: str = "", + graphspace: Optional[str] = None, +): + dataset_name = dataset_name.upper() + if dataset_name == "CORA": + dataset_dgl = CoraGraphDataset(verbose=False) + elif dataset_name == "CITESEER": + dataset_dgl = CiteseerGraphDataset(verbose=False) + elif dataset_name == "PUBMED": + dataset_dgl = PubmedGraphDataset(verbose=False) + else: + raise ValueError("dataset not supported") + graph_dgl = dataset_dgl[0] + + client: PyHugeClient = PyHugeClient( + ip=ip, port=port, graph=graph, user=user, pwd=pwd, graphspace=graphspace + ) + client_schema: SchemaManager = client.schema() + client_graph: GraphManager = client.graph() + # create property schema + client_schema.propertyKey( + "feat" + ).asDouble().valueList().ifNotExist().create() # node features + client_schema.propertyKey("edge_feat").asDouble().valueList().ifNotExist().create() + client_schema.propertyKey("label").asLong().ifNotExist().create() + 
client_schema.propertyKey("train_mask").asInt().ifNotExist().create() + client_schema.propertyKey("val_mask").asInt().ifNotExist().create() + client_schema.propertyKey("test_mask").asInt().ifNotExist().create() + # check props and create vertex label + vertex_label = f"{dataset_name}_edge_feat_vertex" + node_all_props = ["feat", "label", "train_mask", "val_mask", "test_mask"] + node_props = [p for p in node_all_props if p in graph_dgl.ndata] + node_props_value = {} + for p in node_props: + node_props_value[p] = graph_dgl.ndata[p].tolist() + client_schema.vertexLabel(vertex_label).useAutomaticId().properties( + *node_props + ).ifNotExist().create() + # add vertices for batch (note MAX_BATCH_NUM) + idx_to_vertex_id = {} + vdatas = [] + vidxs = [] + for idx in range(graph_dgl.number_of_nodes()): + # extract props + properties = { + p: ( + int(node_props_value[p][idx]) + if isinstance(node_props_value[p][idx], bool) + else node_props_value[p][idx] + ) + for p in node_props + } + vdata = [vertex_label, properties] + vdatas.append(vdata) + vidxs.append(idx) + if len(vdatas) == MAX_BATCH_NUM: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, vidxs)) + vdatas.clear() + vidxs.clear() + # add rest vertices + if len(vdatas) > 0: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, vidxs)) + + # add edges for batch + edge_label = f"{dataset_name}_edge_feat_edge" + edge_all_props = ["edge_feat"] + + client_schema.edgeLabel(edge_label).sourceLabel(vertex_label).targetLabel( + vertex_label + ).properties(*edge_all_props).ifNotExist().create() + edges_src, edges_dst = graph_dgl.edges() + edatas = [] + for src, dst in zip(edges_src.numpy(), edges_dst.numpy()): + properties = {p: (torch.rand(8).tolist()) for p in edge_all_props} + edata = [ + edge_label, + idx_to_vertex_id[src], + idx_to_vertex_id[dst], + vertex_label, + vertex_label, + properties, + ] + edatas.append(edata) + if len(edatas) == MAX_BATCH_NUM: + _add_batch_edges(client_graph, edatas) + edatas.clear() + if len(edatas) > 0: + _add_batch_edges(client_graph, edatas) + + +def import_graph_from_ogb( + dataset_name, + ip: str = "127.0.0.1", + port: str = "8080", + graph: str = "hugegraph", + user: str = "", + pwd: str = "", + graphspace: Optional[str] = None, +): + if dataset_name == "ogbl-collab": + dataset_dgl = DglLinkPropPredDataset(name=dataset_name) + else: + raise ValueError("dataset not supported") + graph_dgl = dataset_dgl[0] + split_edges = dataset_dgl.get_edge_split() + + client: PyHugeClient = PyHugeClient( + ip=ip, port=port, graph=graph, user=user, pwd=pwd, graphspace=graphspace + ) + client_schema: SchemaManager = client.schema() + client_graph: GraphManager = client.graph() + # create property schema + client_schema.propertyKey( + "feat" + ).asDouble().valueList().ifNotExist().create() # node features + client_schema.propertyKey("year").asDouble().valueList().ifNotExist().create() + client_schema.propertyKey("weight").asDouble().valueList().ifNotExist().create() + + # check props and create vertex label + vertex_label = f"{dataset_name}_vertex" + node_all_props = ["feat"] + node_props = [p for p in node_all_props if p in graph_dgl.ndata] + node_props_value = {} + for p in node_props: + node_props_value[p] = graph_dgl.ndata[p].tolist() + client_schema.vertexLabel(vertex_label).useAutomaticId().properties( + *node_props + ).ifNotExist().create() + + # add vertices for batch (note MAX_BATCH_NUM) + idx_to_vertex_id = {} + vdatas = [] + vidxs = [] + max_nodes = 10000 + for idx in 
range(graph_dgl.number_of_nodes()):
+        if idx <= max_nodes:
+            # extract props
+            properties = {
+                p: (
+                    int(node_props_value[p][idx])
+                    if isinstance(node_props_value[p][idx], bool)
+                    else node_props_value[p][idx]
+                )
+                for p in node_props
+            }
+            vdata = [vertex_label, properties]
+            vdatas.append(vdata)
+            vidxs.append(idx)
+            if len(vdatas) == MAX_BATCH_NUM:
+                idx_to_vertex_id.update(
+                    _add_batch_vertices(client_graph, vdatas, vidxs)
+                )
+                vdatas.clear()
+                vidxs.clear()
+    # add the remaining vertices
+    if len(vdatas) > 0:
+        idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, vidxs))
+
+    # add edges in batches
+    edge_label = f"{dataset_name}_edge"
+    edge_all_props = ["year", "weight"]
+    edge_props_value = {}
+    for p in edge_all_props:
+        edge_props_value[p] = graph_dgl.edata[p].tolist()
+    client_schema.edgeLabel(edge_label).sourceLabel(vertex_label).targetLabel(
+        vertex_label
+    ).properties(*edge_all_props).ifNotExist().create()
+    edges_src, edges_dst = graph_dgl.edges()
+    edatas = []
+    # look up edge properties by the edge index (eid), not by the stale
+    # node-loop variable `idx`
+    for eid, (src, dst) in enumerate(zip(edges_src.numpy(), edges_dst.numpy())):
+        if src <= max_nodes and dst <= max_nodes:
+            properties = {
+                p: (
+                    int(edge_props_value[p][eid])
+                    if isinstance(edge_props_value[p][eid], bool)
+                    else edge_props_value[p][eid]
+                )
+                for p in edge_all_props
+            }
+            edata = [
+                edge_label,
+                idx_to_vertex_id[src],
+                idx_to_vertex_id[dst],
+                vertex_label,
+                vertex_label,
+                properties,
+            ]
+            edatas.append(edata)
+            if len(edatas) == MAX_BATCH_NUM:
+                _add_batch_edges(client_graph, edatas)
+                edatas.clear()
+    if len(edatas) > 0:
+        _add_batch_edges(client_graph, edatas)
+    print("begin edge split")
+    import_split_edge_from_ogb(
+        dataset_name=dataset_name,
+        idx_to_vertex_id=idx_to_vertex_id,
+        max_nodes=max_nodes,
+    )
+
+
+def import_split_edge_from_ogb(
+    dataset_name,
+    idx_to_vertex_id,
+    max_nodes: int,
+    ip: str = "127.0.0.1",
+    port: str = "8080",
+    graph: str = "hugegraph",
+    user: str = "",
+    pwd: str = "",
+    graphspace: Optional[str] = None,
+):
+    if dataset_name == "ogbl-collab":
+        dataset_dgl = DglLinkPropPredDataset(name=dataset_name)
+    else:
+        raise ValueError("dataset not supported")
+    split_edges = dataset_dgl.get_edge_split()
+
+    client: PyHugeClient = PyHugeClient(
+        ip=ip, port=port, graph=graph, user=user, pwd=pwd, graphspace=graphspace
+    )
+    client_schema: SchemaManager = client.schema()
+    client_graph: GraphManager = client.graph()
+    # create property schema
+    client_schema.propertyKey("train_edge_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("train_year_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("train_weight_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("valid_edge_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("valid_weight_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("valid_year_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("valid_edge_neg_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("test_edge_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("test_weight_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("test_year_mask").asInt().ifNotExist().create()
+    client_schema.propertyKey("test_edge_neg_mask").asInt().ifNotExist().create()
+    edge_all_props = [
+        "train_edge_mask",
+        "train_year_mask",
+        "train_weight_mask",
+        "valid_edge_mask",
+        "valid_weight_mask",
+        "valid_year_mask",
+        "valid_edge_neg_mask",
+        "test_edge_mask",
+        "test_weight_mask",
+        "test_year_mask",
+        "test_edge_neg_mask",
+    ]
+    edge_props = [
+
"train_edge_mask", + "valid_edge_mask", + "valid_edge_neg_mask", + "test_edge_mask", + "test_edge_neg_mask", + ] + # add edges for batch + vertex_label = f"{dataset_name}_vertex" + edge_label = f"{dataset_name}_split_edge" + client_schema.edgeLabel(edge_label).sourceLabel(vertex_label).targetLabel( + vertex_label + ).properties(*edge_all_props).ifNotExist().create() + edges = {} + edges["train_edge_mask"] = split_edges["train"]["edge"] + edges["train_year_mask"] = split_edges["train"]["year"] + edges["train_weight_mask"] = split_edges["train"]["weight"] + edges["valid_edge_mask"] = split_edges["valid"]["edge"] + edges["valid_weight_mask"] = split_edges["valid"]["weight"] + edges["valid_year_mask"] = split_edges["valid"]["year"] + edges["valid_edge_neg_mask"] = split_edges["valid"]["edge_neg"] + edges["test_edge_mask"] = split_edges["test"]["edge"] + edges["test_weight_mask"] = split_edges["test"]["weight"] + edges["test_year_mask"] = split_edges["test"]["year"] + edges["test_edge_neg_mask"] = split_edges["test"]["edge_neg"] + init_ogb_split_edge( + "train", + "valid", + "test", + "", + edges, + max_nodes, + edge_props, + vertex_label, + edge_label, + idx_to_vertex_id, + client_graph, + ) + init_ogb_split_edge( + "valid", + "train", + "test", + "", + edges, + max_nodes, + edge_props, + vertex_label, + edge_label, + idx_to_vertex_id, + client_graph, + ) + init_ogb_split_edge( + "valid", + "train", + "test", + "neg_", + edges, + max_nodes, + edge_props, + vertex_label, + edge_label, + idx_to_vertex_id, + client_graph, + ) + init_ogb_split_edge( + "test", + "train", + "valid", + "", + edges, + max_nodes, + edge_props, + vertex_label, + edge_label, + idx_to_vertex_id, + client_graph, + ) + init_ogb_split_edge( + "test", + "train", + "valid", + "neg_", + edges, + max_nodes, + edge_props, + vertex_label, + edge_label, + idx_to_vertex_id, + client_graph, + ) + + +def import_hetero_graph_from_dgl_bgnn( + dataset_name, + ip: str = "127.0.0.1", + port: str = "8080", + graph: str = "hugegraph", + user: str = "", + pwd: str = "", + graphspace: Optional[str] = None, +): + # dataset download from : https://www.dropbox.com/s/verx1evkykzli88/datasets.zip + # Extract zip folder in this directory + dataset_name = dataset_name.upper() + if dataset_name == "AVAZU": + hetero_graph = read_input() + else: + raise ValueError("dataset not supported") + client: PyHugeClient = PyHugeClient( + ip=ip, port=port, graph=graph, user=user, pwd=pwd, graphspace=graphspace + ) + client_schema: SchemaManager = client.schema() + client_graph: GraphManager = client.graph() + + client_schema.propertyKey("feat").asInt().valueList().ifNotExist().create() + client_schema.propertyKey("class").asDouble().valueList().ifNotExist().create() + client_schema.propertyKey("cat_features").asInt().valueList().ifNotExist().create() + client_schema.propertyKey("train_mask").asInt().ifNotExist().create() + client_schema.propertyKey("val_mask").asInt().ifNotExist().create() + client_schema.propertyKey("test_mask").asInt().ifNotExist().create() + + ntype_to_vertex_label = {} + ntype_idx_to_vertex_id = {} + for ntype in hetero_graph.ntypes: + # create vertex schema + vertex_label = f"{dataset_name}_{ntype}_v" + ntype_to_vertex_label[ntype] = vertex_label + all_props = [ + "feat", + "class", + "cat_features", + "train_mask", + "val_mask", + "test_mask", + ] + # check properties + props = [p for p in all_props if p in hetero_graph.nodes[ntype].data] + client_schema.vertexLabel(vertex_label).useAutomaticId().properties( + *props + 
).ifNotExist().create() + props_value = {} + for p in props: + props_value[p] = hetero_graph.nodes[ntype].data[p].tolist() + # add vertices for batch of ntype + idx_to_vertex_id = {} + vdatas = [] + idxs = [] + for idx in range(hetero_graph.number_of_nodes(ntype=ntype)): + properties = { + p: ( + int(props_value[p][idx]) + if isinstance(props_value[p][idx], bool) + else props_value[p][idx] + ) + for p in props + } + vdata = [vertex_label, properties] + vdatas.append(vdata) + idxs.append(idx) + if len(vdatas) == MAX_BATCH_NUM: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, idxs)) + vdatas.clear() + idxs.clear() + if len(vdatas) > 0: + idx_to_vertex_id.update(_add_batch_vertices(client_graph, vdatas, idxs)) + ntype_idx_to_vertex_id[ntype] = idx_to_vertex_id + + # add edges + edatas = [] + for canonical_etype in hetero_graph.canonical_etypes: + # create edge schema + src_type, etype, dst_type = canonical_etype + edge_label = f"{dataset_name}_{etype}_e" + client_schema.edgeLabel(edge_label).sourceLabel( + ntype_to_vertex_label[src_type] + ).targetLabel(ntype_to_vertex_label[dst_type]).ifNotExist().create() + # add edges for batch of canonical_etype + srcs, dsts = hetero_graph.edges(etype=canonical_etype) + for src, dst in zip(srcs.numpy(), dsts.numpy()): + edata = [ + edge_label, + ntype_idx_to_vertex_id[src_type][src], + ntype_idx_to_vertex_id[dst_type][dst], + ntype_to_vertex_label[src_type], + ntype_to_vertex_label[dst_type], + {}, + ] + edatas.append(edata) + if len(edatas) == MAX_BATCH_NUM: + _add_batch_edges(client_graph, edatas) + edatas.clear() + if len(edatas) > 0: + _add_batch_edges(client_graph, edatas) + + +def init_ogb_split_edge( + a, + b, + c, + d, + edges, + max_nodes, + edge_props, + vertex_label, + edge_label, + idx_to_vertex_id, + client_graph, +): + edatas = [] + for idx, edge in enumerate(edges[f"{a}_edge_{d}mask"]): + if int(edge[0]) <= max_nodes and int(edge[1]) <= max_nodes: + properties = {q: (int(q == f"{a}_edge_{d}mask")) for q in edge_props} + if d != "neg_": + properties2 = { + f"{a}_year_mask": int(edges[f"{a}_year_mask"][idx]), + f"{a}_weight_mask": int(edges[f"{a}_weight_mask"][idx]), + } + properties3 = { + f"{b}_year_mask": -1, + f"{b}_weight_mask": -1, + f"{c}_year_mask": -1, + f"{c}_weight_mask": -1, + } + properties.update(properties2) + properties.update(properties3) + else: + properties2 = { + f"{a}_year_mask": -1, + f"{a}_weight_mask": -1, + f"{b}_year_mask": -1, + f"{b}_weight_mask": -1, + f"{c}_year_mask": -1, + f"{c}_weight_mask": -1, + } + properties.update(properties2) + edata = [ + edge_label, + idx_to_vertex_id[int(edge[0])], + idx_to_vertex_id[int(edge[1])], + vertex_label, + vertex_label, + properties, + ] + edatas.append(edata) + if len(edatas) == MAX_BATCH_NUM: + _add_batch_edges(client_graph, edatas) + edatas.clear() + if len(edatas) > 0: + _add_batch_edges(client_graph, edatas) def _add_batch_vertices(client_graph, vdatas, vidxs): vertices = client_graph.addVertices(vdatas) @@ -361,6 +1006,99 @@ def load_acm_raw(): return hgraph +def read_input(): + # reference: https://github.com/dmlc/dgl/blob/master/examples/pytorch/bgnn/run.py + # I added X, y, cat_features and masks into graph + input_folder = "dataset/avazu" + X = pd.read_csv(f"{input_folder}/X.csv") + y = pd.read_csv(f"{input_folder}/y.csv") + + categorical_columns = [] + if os.path.exists(f"{input_folder}/cat_features.txt"): + with open(f"{input_folder}/cat_features.txt") as f: + for line in f: + if line.strip(): + categorical_columns.append(line.strip()) + + 
cat_features = None + if categorical_columns: + columns = X.columns + cat_features = np.where(columns.isin(categorical_columns))[0] + + for col in list(columns[cat_features]): + X[col] = X[col].astype(str) + + gs, _ = load_graphs(f"{input_folder}/graph.dgl") + graph = gs[0] + + with open(f"{input_folder}/masks.json") as f: + masks = json.load(f) + + # add X + features = [[int(x) for x in row] for row in X.values] + features_tensor = torch.tensor(features, dtype=torch.int32) + graph.ndata["feat"] = features_tensor + + # add y + y_tensor = torch.tensor(y.values, dtype=torch.float64) + graph.ndata["class"] = y_tensor + + # add masks + for mask_name, node_ids in masks["0"].items(): + mask_tensor = torch.zeros(graph.number_of_nodes(), dtype=torch.int32) + mask_tensor[node_ids] = 1 + graph.ndata[f"{mask_name}_mask"] = mask_tensor + + # add cat_features + cat_features_tensor = torch.tensor(cat_features, dtype=torch.int32) + graph.ndata["cat_features"] = torch.repeat_interleave( + cat_features_tensor[None, :], repeats=graph.number_of_nodes(), dim=0 + ) + + return graph + + +def load_training_data_gatne(): + # reference: https://github.com/dmlc/dgl/blob/master/examples/pytorch/GATNE-T/src/utils.py + # reference: https://github.com/dmlc/dgl/blob/master/examples/pytorch/GATNE-T/src/main.py + f_name = "dataset/amazon/train.txt" + print("We are loading data from:", f_name) + edge_data_by_type = dict() + with open(f_name, "r") as f: + for line in f: + words = line[:-1].split(" ") # line[-1] == '\n' + if words[0] not in edge_data_by_type: + edge_data_by_type[words[0]] = list() + x, y = words[1], words[2] + edge_data_by_type[words[0]].append((x, y)) + nodes, index2word = [], [] + for edge_type in edge_data_by_type: + node1, node2 = zip(*edge_data_by_type[edge_type]) + index2word = index2word + list(node1) + list(node2) + index2word = list(set(index2word)) + vocab = {} + i = 0 + for word in index2word: + vocab[word] = i + i = i + 1 + for edge_type in edge_data_by_type: + node1, node2 = zip(*edge_data_by_type[edge_type]) + tmp_nodes = list(set(list(node1) + list(node2))) + tmp_nodes = [vocab[word] for word in tmp_nodes] + nodes.append(tmp_nodes) + node_type = "_N" # '_N' can be replaced by an arbitrary name + data_dict = dict() + num_nodes_dict = {node_type: len(vocab)} + for edge_type in edge_data_by_type: + tmp_data = edge_data_by_type[edge_type] + src = [] + dst = [] + for edge in tmp_data: + src.extend([vocab[edge[0]], vocab[edge[1]]]) + dst.extend([vocab[edge[1]], vocab[edge[0]]]) + data_dict[(node_type, edge_type, node_type)] = (src, dst) + graph = dgl.heterograph(data_dict, num_nodes_dict) + return graph def _get_mask(size, indices): mask = torch.zeros(size) @@ -373,3 +1111,8 @@ def _get_mask(size, indices): import_graph_from_dgl("CORA") import_graphs_from_dgl("MUTAG") import_hetero_graph_from_dgl("ACM") + import_graph_from_nx("CAVEMAN") + import_graph_from_dgl_with_edge_feat("CORA") + import_graph_from_ogb("ogbl-collab") + import_hetero_graph_from_dgl_bgnn("AVAZU") + import_hetero_graph_from_dgl_no_feat("amazongatne") diff --git a/hugegraph-ml/src/tests/test_examples/test_examples.py b/hugegraph-ml/src/tests/test_examples/test_examples.py index 6ed8b812..861b240c 100644 --- a/hugegraph-ml/src/tests/test_examples/test_examples.py +++ b/hugegraph-ml/src/tests/test_examples/test_examples.py @@ -23,7 +23,18 @@ from hugegraph_ml.examples.grace_example import grace_example from hugegraph_ml.examples.grand_example import grand_example from hugegraph_ml.examples.jknet_example import jknet_example - +from 
hugegraph_ml.examples.agnn_example import agnn_example +from hugegraph_ml.examples.appnp_example import appnp_example +from hugegraph_ml.examples.arma_example import arma_example +from hugegraph_ml.examples.bgnn_example import bgnn_example +from hugegraph_ml.examples.bgrl_example import bgrl_example +from hugegraph_ml.examples.care_gnn_example import care_gnn_example +from hugegraph_ml.examples.cluster_gcn_example import cluster_gcn_example +from hugegraph_ml.examples.correct_and_smooth_example import cs_example +from hugegraph_ml.examples.dagnn_example import dagnn_example +from hugegraph_ml.examples.deepergcn_example import deepergcn_example +from hugegraph_ml.examples.pgnn_example import pgnn_example +from hugegraph_ml.examples.seal_example import seal_example class TestHugegraph2DGL(unittest.TestCase): def setUp(self): @@ -64,3 +75,75 @@ def test_gin_example(self): gin_example(n_epochs=self.test_n_epochs) except ValueError: self.fail("model gin example failed") + + def test_agnn_example(self): + try: + agnn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model agnn example failed") + + def test_appnp_example(self): + try: + appnp_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model appnp example failed") + + def test_arma_example(self): + try: + arma_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model arma example failed") + + def test_bgnn_example(self): + try: + bgnn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model bgnn example failed") + + def test_bgrl_example(self): + try: + bgrl_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model bgrl example failed") + + def test_cluster_gcn_example(self): + try: + cluster_gcn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model cluster-gcn example failed") + + def test_correct_and_smooth_example(self): + try: + cs_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model correct and smooth example failed") + + def test_dagnn_example(self): + try: + dagnn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model dagnn example failed") + + def test_deepergcn_example(self): + try: + deepergcn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model deepergcn example failed") + + def test_pgnn_example(self): + try: + pgnn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model p-gnn example failed") + + def test_seal_example(self): + try: + seal_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model seal example failed") + + def test_care_gnn_example(self): + try: + care_gnn_example(n_epochs=self.test_n_epochs) + except ValueError: + self.fail("model care-gnn example failed")
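As a usage sketch for reviewers (not part of the patch itself): the new importer utilities and example entry points compose end-to-end. Assuming a HugeGraph server on the default `127.0.0.1:8080`, something like the following exercises one of the added models; the epoch budget is an arbitrary choice for a quick smoke run.

```python
# Hypothetical smoke run; both functions are added by this patch.
from hugegraph_ml.utils.dgl2hugegraph_utils import import_graph_from_dgl
from hugegraph_ml.examples.appnp_example import appnp_example

import_graph_from_dgl("CORA")  # write the CORA citation graph into HugeGraph
appnp_example(n_epochs=50)     # read it back, then train and evaluate APPNP
```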