Skip to content

Commit 26839c6

Browse files
committed
Adapt tpch-example to multitable
1 parent 3fab734 commit 26839c6

File tree

1 file changed

+57
-75
lines changed

1 file changed

+57
-75
lines changed

snowgraph/tpch-example.sql

Lines changed: 57 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -50,70 +50,42 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
5050
-- ==================================================
5151
-- 1. Data preparation
5252
-- ==================================================
53-
5453
-- Create a database which we will use to prepare data for GDS.
5554
CREATE DATABASE IF NOT EXISTS tpch_example;
5655
CREATE SCHEMA IF NOT EXISTS tpch_example.gds;
5756
USE SCHEMA tpch_example.gds;
5857

59-
-- GDS expects the data to be in a specific format: a table/view for nodes and a table/view for relationships.
60-
-- In addition, GDS requires node identifiers to be globally unique integers.
58+
-- GDS reads data from tables that represent nodes and relationships.
59+
-- Nodes are usually represented by entity tables, like persons or products.
60+
-- Relationships are foreign keys between entity tables (1:1, 1:n) or via mapping tables (n:m).
61+
-- In addition, GDS expects certain naming conventions on column names.
62+
-- If the data is not yet in the right format, we can use views to get there.
6163
--
62-
-- For our analysis, the nodes will be parts and the orders in which they appeared.
64+
-- For our analysis, we will use two different types of nodes: parts and orders.
65+
-- We want to find similar parts by looking at the orders in which they appeared.
6366
-- The relationships will be the line items linking a part to an order.
64-
--
65-
-- We start by creating the node view for our graph.
66-
-- First we need to map the primary keys for parts and orders to globally unique node ids.
67-
68-
-- We use a sequence to generate globally unique node identifiers.
69-
CREATE OR REPLACE SEQUENCE global_id START = 0 INCREMENT = 1;
70-
71-
-- We create two mapping tables, one for parts and one for orders.
72-
-- This is necessary because the primary key sets for both tables might overlap.
73-
CREATE OR REPLACE TABLE node_mapping_parts(gdsId, p_partkey) AS
74-
SELECT global_id.nextval, p_partkey
75-
FROM snowflake_sample_data.tpch_sf1.part;
76-
CREATE OR REPLACE TABLE node_mapping_orders(gdsId, o_orderkey) AS
77-
SELECT global_id.nextval, o_orderkey
78-
FROM snowflake_sample_data.tpch_sf1.orders;
79-
80-
-- Next, we can create the final node view that we use for our graph projection.
81-
-- Note, that the view must contain a column named "nodeId" to be recognized by GDS.
82-
-- Any additional column will be used as node property, but we don't need that for this example.
83-
CREATE OR REPLACE VIEW nodes(nodeId) AS
84-
SELECT nmp.gdsId FROM node_mapping_parts nmp
85-
UNION
86-
SELECT nmo.gdsId FROM node_mapping_orders nmo;
87-
88-
-- Let's quickly verify the cardinality of our views.
89-
-- As it is the union of parts and orders, we expect 1,700,000 rows.
90-
SELECT count(*) FROM nodes;
91-
92-
-- We can now create the relationship view.
93-
-- As mentioned earlier, we will use the line items to create relationships between parts and orders.
94-
-- We join the line items with parts and orders to get the source and target nodes for our relationships.
95-
-- We also join the mapping tables to get the globally unique node ids.
96-
-- Note, that the view must contain columns named "sourceNodeId" and "targetNodeId" to be recognized by GDS.
97-
-- Any additional column will be used as relationship property, but we don't need that for this example.
98-
CREATE OR REPLACE VIEW relationships(sourceNodeId, targetNodeId) AS
99-
SELECT
100-
nmp.gdsId AS sourceNodeId,
101-
nmo.gdsId AS targetNodeId
102-
FROM snowflake_sample_data.tpch_sf1.part p
103-
-- The first two joins build the relationships between parts and orders
104-
JOIN snowflake_sample_data.tpch_sf1.lineitem l
105-
ON p.p_partkey = l.l_partkey
106-
JOIN snowflake_sample_data.tpch_sf1.orders o
107-
ON o.o_orderkey = l.l_orderkey
108-
-- The second two joins map the primary keys to globally unique node ids
109-
JOIN node_mapping_parts nmp
110-
ON nmp.p_partkey = p.p_partkey
111-
JOIN node_mapping_orders nmo
112-
ON nmo.o_orderkey = o.o_orderkey;
113-
114-
-- Let's quickly verify the cardinality of our relationship view.
115-
-- As it is the join of parts, line items, and orders, we expect 6,001,215 rows.
116-
SELECT count(*) FROM relationships;
67+
-- The result will be a new table containing pairs of parts including their similarity score.
68+
69+
-- We start by creating two views to represent our node tables.
70+
-- GDS requires a node table to contain a 'nodeId' column.
71+
-- Since we do not need any node properties, this will be the only column we project.
72+
-- Note, that the `nodeId` column is used to uniquely identify a node in the table.
73+
-- The uniqueness is usually achieved by using the primary key in that table, here 'p_partkey'.
74+
CREATE OR REPLACE VIEW parts (nodeId) AS
75+
SELECT p.p_partkey AS nodeId FROM snowflake_sample_data.tpch_sf1.part p;
76+
77+
-- We do the same for the orders by projecting the `o_orderkey` to 'nodeId'.
78+
CREATE OR REPLACE VIEW orders (nodeId) AS
79+
SELECT o.o_orderkey AS nodeId FROM snowflake_sample_data.tpch_sf1.orders o;
80+
81+
-- The line items represent the relationship between parts and orders.
82+
-- GDS requires a `sourceNodeId` and a `targetNodeId` column to identify the source and target nodes of each relationship.
83+
-- Here, a part is the source of a relationship and an order is the target.
84+
CREATE OR REPLACE VIEW part_in_order(sourceNodeId, targetNodeId) AS
85+
SELECT
86+
l.l_partkey AS sourceNodeId,
87+
l.l_orderkey AS targetNodeId
88+
FROM snowflake_sample_data.tpch_sf1.lineitem l;
11789

11890
-- We have now prepared the data for GDS.
11991

@@ -127,8 +99,8 @@ USE DATABASE Neo4j_GDS;
12799
-- Next, we want to consider the warehouse that the GDS application will use to execute queries.
128100
-- For this example we use a MEDIUM size warehouse, so we configure the application's warehouse accordingly.
129101
ALTER WAREHOUSE Neo4j_GDS_app_warehouse SET WAREHOUSE_SIZE='MEDIUM';
130-
-- A highly performant warehouse will speed up graph projections but does not affect algorithm computation.
131-
-- It can therefore be a good idea to alter the warehouse size and make other configuration changes to increase performance when projecting larger amounts of data.
102+
-- A highly performant warehouse can speed up graph projections but does not affect algorithm computation.
103+
-- Especially if the views are more complex than shown in this example, a more performant warehouse is beneficial.
132104
-- The warehouse can then be brought back to a less expensive configuration after the projection is done.
133105
-- ALTER WAREHOUSE Neo4j_GDS_app_warehouse
134106
-- WAREHOUSE_SIZE='X-SMALL';
@@ -169,12 +141,26 @@ CALL gds.create_session('CPU_X64_L');
169141

170142
-- Once the session is started, we can project our node and relationship views into a GDS in-memory graph.
171143
-- The graph will be identified by the name "parts_in_orders".
172-
-- The mandatory parameters are the node table and the relationship table, which we point those to our prepared views.
144+
-- The mandatory parameters are the node tables and the relationship tables.
145+
-- A node table mapping points from a table/view to a node label that is used in the GDS graph.
146+
-- For example, the rows of 'tpch_example.gds.parts' will be nodes labeled as 'Part'.
147+
-- Relationship tables need a bit more configuration.
148+
-- Besides the type that is used in the GDS graph, here 'PART_IN_ORDER', we also need to specify source and target tables.
173149
-- We also specify the optional read concurrency to optimize building the graph projection.
174150
-- The concurrency can be set to the number of cores available on the compute pool node.
175151
SELECT gds.graph_project('parts_in_orders', {
176-
'nodeTable': 'tpch_example.gds.nodes',
177-
'relationshipTable': 'tpch_example.gds.relationships',
152+
'nodeTables': {
153+
'tpch_example.gds.parts': 'Part',
154+
'tpch_example.gds.orders': 'Order'
155+
},
156+
'relationshipTables': {
157+
'tpch_example.gds.part_in_order': {
158+
'type': 'PART_IN_ORDER',
159+
'source_table': 'tpch_example.gds.parts',
160+
'target_table': 'tpch_example.gds.orders',
161+
'orientation': 'NATURAL'
162+
}
163+
},
178164
'readConcurrency': 28
179165
});
180166

@@ -192,8 +178,10 @@ SELECT gds.node_similarity('parts_in_orders', {
192178

193179
-- Once the algorithm has finished, we can write the results back to Snowflake tables for further analysis.
194180
-- We want to write back the similarity relationships between parts.
195-
-- The specified table will contain the globally unique source and target node ids and the similarity score.
181+
-- The specified table will contain the original source and target node ids and the similarity score.
196182
SELECT gds.write_relationships('parts_in_orders', {
183+
'sourceLabel': 'Part',
184+
'targetLabel': 'Part',
197185
'relationshipType': 'SIMILAR_TO',
198186
'relationshipProperty': 'similarity',
199187
'table': 'tpch_example.gds.part_similar_to_part'
@@ -208,19 +196,13 @@ GRANT SELECT ON tpch_example.gds.part_similar_to_part TO ROLE <your_role>;
208196
-- Simply speaking, this could be used as a recommendation system for parts.
209197
SELECT DISTINCT p_source.p_name, p_target.p_name, sim.similarity
210198
FROM snowflake_sample_data.tpch_sf1.part p_source
211-
JOIN tpch_example.gds.node_mapping_parts nmp_source
212-
ON p_source.p_partkey = nmp_source.p_partkey
213-
JOIN tpch_example.gds.part_similar_to_part sim
214-
ON nmp_source.gdsid = sim.sourcenodeid
215-
JOIN tpch_example.gds.node_mapping_parts nmp_target
216-
ON sim.targetnodeid = nmp_target.gdsid
217-
JOIN snowflake_sample_data.tpch_sf1.part p_target
218-
ON nmp_target.p_partkey = p_target.p_partkey
219-
ORDER BY sim.similarity DESC
220-
LIMIT 10;
199+
JOIN tpch_example.gds.part_similar_to_part sim
200+
ON p_source.p_partkey = sim.sourcenodeid
201+
JOIN snowflake_sample_data.tpch_sf1.part p_target
202+
ON p_target.p_partkey = sim.targetnodeid
203+
ORDER BY sim.similarity DESC LIMIT 10;
221204

222205
-- The GDS service is a long-running service and should be stopped when not in use.
223206
-- Once we completed our analysis, we can stop the session, which suspends the container service.
224207
-- We can restart the session at any time to continue our analysis.
225-
CALL Neo4j_GDS.gds.stop_session();
226-
208+
CALL gds.stop_session();

0 commit comments

Comments
 (0)