Skip to content

Commit 26839c6

Browse files
committed
Adapt tpch-example to multitable
1 parent 3fab734 commit 26839c6

File tree

1 file changed

+57
-75
lines changed

1 file changed

+57
-75
lines changed

snowgraph/tpch-example.sql

Lines changed: 57 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -50,70 +50,42 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
5050
-- ==================================================
5151
-- 1. Data preparation
5252
-- ==================================================
53-
5453
-- Create a database which we will use to prepare data for GDS.
5554
CREATE DATABASE IF NOT EXISTS tpch_example;
5655
CREATE SCHEMA IF NOT EXISTS tpch_example.gds;
5756
USE SCHEMA tpch_example.gds;
5857

59-
-- GDS expects the data to be in a specific format: a table/view for nodes and a table/view for relationships.
60-
-- In addition, GDS requires node identifiers to be globally unique integers.
58+
-- GDS reads data from tables that represent nodes and relationships.
59+
-- Nodes are usually represented by entity tables, like persons or products.
60+
-- Relationships are foreign keys between entity tables (1:1, 1:n) or via mapping tables (n:m).
61+
-- In addition, GDS expects certain naming conventions on column names.
62+
-- If the data is not yet in the right format, we can use views to get there.
6163
--
62-
-- For our analysis, the nodes will be parts and the orders in which they appeared.
64+
-- For our analysis, we will use two different types of nodes: parts and orders.
65+
-- We want to find similar parts by looking at the orders in which they appeared.
6366
-- The relationships will be the line items linking a part to an order.
64-
--
65-
-- We start by creating the node view for our graph.
66-
-- First we need to map the primary keys for parts and orders to globally unique node ids.
67-
68-
-- We use a sequence to generate globally unique node identifiers.
69-
CREATE OR REPLACE SEQUENCE global_id START = 0 INCREMENT = 1;
70-
71-
-- We create two mapping tables, one for parts and one for orders.
72-
-- This is necessary because the primary key sets for both tables might overlap.
73-
CREATE OR REPLACE TABLE node_mapping_parts(gdsId, p_partkey) AS
74-
SELECT global_id.nextval, p_partkey
75-
FROM snowflake_sample_data.tpch_sf1.part;
76-
CREATE OR REPLACE TABLE node_mapping_orders(gdsId, o_orderkey) AS
77-
SELECT global_id.nextval, o_orderkey
78-
FROM snowflake_sample_data.tpch_sf1.orders;
79-
80-
-- Next, we can create the final node view that we use for our graph projection.
81-
-- Note, that the view must contain a column named "nodeId" to be recognized by GDS.
82-
-- Any additional column will be used as node property, but we don't need that for this example.
83-
CREATE OR REPLACE VIEW nodes(nodeId) AS
84-
SELECT nmp.gdsId FROM node_mapping_parts nmp
85-
UNION
86-
SELECT nmo.gdsId FROM node_mapping_orders nmo;
87-
88-
-- Let's quickly verify the cardinality of our views.
89-
-- As it is the union of parts and orders, we expect 1,700,000 rows.
90-
SELECT count(*) FROM nodes;
91-
92-
-- We can now create the relationship view.
93-
-- As mentioned earlier, we will use the line items to create relationships between parts and orders.
94-
-- We join the line items with parts and orders to get the source and target nodes for our relationships.
95-
-- We also join the mapping tables to get the globally unique node ids.
96-
-- Note, that the view must contain columns named "sourceNodeId" and "targetNodeId" to be recognized by GDS.
97-
-- Any additional column will be used as relationship property, but we don't need that for this example.
98-
CREATE OR REPLACE VIEW relationships(sourceNodeId, targetNodeId) AS
99-
SELECT
100-
nmp.gdsId AS sourceNodeId,
101-
nmo.gdsId AS targetNodeId
102-
FROM snowflake_sample_data.tpch_sf1.part p
103-
-- The first two joins build the relationships between parts and orders
104-
JOIN snowflake_sample_data.tpch_sf1.lineitem l
105-
ON p.p_partkey = l.l_partkey
106-
JOIN snowflake_sample_data.tpch_sf1.orders o
107-
ON o.o_orderkey = l.l_orderkey
108-
-- The second two joins map the primary keys to globally unique node ids
109-
JOIN node_mapping_parts nmp
110-
ON nmp.p_partkey = p.p_partkey
111-
JOIN node_mapping_orders nmo
112-
ON nmo.o_orderkey = o.o_orderkey;
113-
114-
-- Let's quickly verify the cardinality of our relationship view.
115-
-- As it is the join of parts, line items, and orders, we expect 6,001,215 rows.
116-
SELECT count(*) FROM relationships;
67+
-- The result will be a new table containing pairs of parts including their similarity score.
68+
69+
-- We start by creating two views to represent our node tables.
70+
-- GDS requires a node table to contain a 'nodeId' column.
71+
-- Since we do not need any node properties, this will be the only column we project.
72+
-- Note, that the `nodeId` column is used to uniquely identify a node in the table.
73+
-- The uniqueness is usually achieved by using the primary key in that table, here 'p_partkey'.
74+
CREATE OR REPLACE VIEW parts (nodeId) AS
75+
SELECT p.p_partkey AS nodeId FROM snowflake_sample_data.tpch_sf1.part p;
76+
77+
-- We do the same for the orders by projecting the `o_orderkey` to 'nodeId'.
78+
CREATE OR REPLACE VIEW orders (nodeId) AS
79+
SELECT o.o_orderkey AS nodeId FROM snowflake_sample_data.tpch_sf1.orders o;
80+
81+
-- The line items represent the relationship between parts and orders.
82+
-- GDS requires a `sourceNodeId` and a `targetNodeId` column to identify the source and target nodes of each relationship.
83+
-- Here, a part is the source of a relationship and an order is the target.
84+
CREATE OR REPLACE VIEW part_in_order(sourceNodeId, targetNodeId) AS
85+
SELECT
86+
l.l_partkey AS sourceNodeId,
87+
l.l_orderkey AS targetNodeId
88+
FROM snowflake_sample_data.tpch_sf1.lineitem l;
11789

11890
-- We have now prepared the data for GDS.
11991

@@ -127,8 +99,8 @@ USE DATABASE Neo4j_GDS;
12799
-- Next, we want to consider the warehouse that the GDS application will use to execute queries.
128100
-- For this example we use a MEDIUM size warehouse, so we configure the application's warehouse accordingly.
129101
ALTER WAREHOUSE Neo4j_GDS_app_warehouse SET WAREHOUSE_SIZE='MEDIUM';
130-
-- A highly performant warehouse will speed up graph projections but does not affect algorithm computation.
131-
-- It can therefore be a good idea to alter the warehouse size and make other configuration changes to increase performance when projecting larger amounts of data.
102+
-- A highly performant warehouse can speed up graph projections but does not affect algorithm computation.
103+
-- Especially if the views are more complex than shown in this example, a more performant warehouse is beneficial.
132104
-- The warehouse can then be brought back to a less expensive configuration after the projection is done.
133105
-- ALTER WAREHOUSE Neo4j_GDS_app_warehouse
134106
-- WAREHOUSE_SIZE='X-SMALL';
@@ -169,12 +141,26 @@ CALL gds.create_session('CPU_X64_L');
169141

170142
-- Once the session is started, we can project our node and relationship views into a GDS in-memory graph.
171143
-- The graph will be identified by the name "parts_in_orders".
172-
-- The mandatory parameters are the node table and the relationship table, which we point those to our prepared views.
144+
-- The mandatory parameters are the node tables and the relationship tables.
145+
-- A node table mapping points from a table/view to a node label that is used in the GDS graph.
146+
-- For example, the rows of 'tpch_example.gds.parts' will be nodes labeled as 'Part'.
147+
-- Relationship tables need a bit more configuration.
148+
-- Besides the type that is used in the GDS graph, here 'PART_IN_ORDER', we also need to specify source and target tables.
173149
-- We also specify the optional read concurrency to optimize building the graph projection.
174150
-- The concurrency can be set to the number of cores available on the compute pool node.
175151
SELECT gds.graph_project('parts_in_orders', {
176-
'nodeTable': 'tpch_example.gds.nodes',
177-
'relationshipTable': 'tpch_example.gds.relationships',
152+
'nodeTables': {
153+
'tpch_example.gds.parts': 'Part',
154+
'tpch_example.gds.orders': 'Order'
155+
},
156+
'relationshipTables': {
157+
'tpch_example.gds.part_in_order': {
158+
'type': 'PART_IN_ORDER',
159+
'source_table': 'tpch_example.gds.parts',
160+
'target_table': 'tpch_example.gds.orders',
161+
'orientation': 'NATURAL'
162+
}
163+
},
178164
'readConcurrency': 28
179165
});
180166

@@ -192,8 +178,10 @@ SELECT gds.node_similarity('parts_in_orders', {
192178

193179
-- Once the algorithm has finished, we can write the results back to Snowflake tables for further analysis.
194180
-- We want to write back the similarity relationships between parts.
195-
-- The specified table will contain the globally unique source and target node ids and the similarity score.
181+
-- The specified table will contain the original source and target node ids and the similarity score.
196182
SELECT gds.write_relationships('parts_in_orders', {
183+
'sourceLabel': 'Part',
184+
'targetLabel': 'Part',
197185
'relationshipType': 'SIMILAR_TO',
198186
'relationshipProperty': 'similarity',
199187
'table': 'tpch_example.gds.part_similar_to_part'
@@ -208,19 +196,13 @@ GRANT SELECT ON tpch_example.gds.part_similar_to_part TO ROLE <your_role>;
208196
-- Simply speaking, this could be used as a recommendation system for parts.
209197
SELECT DISTINCT p_source.p_name, p_target.p_name, sim.similarity
210198
FROM snowflake_sample_data.tpch_sf1.part p_source
211-
JOIN tpch_example.gds.node_mapping_parts nmp_source
212-
ON p_source.p_partkey = nmp_source.p_partkey
213-
JOIN tpch_example.gds.part_similar_to_part sim
214-
ON nmp_source.gdsid = sim.sourcenodeid
215-
JOIN tpch_example.gds.node_mapping_parts nmp_target
216-
ON sim.targetnodeid = nmp_target.gdsid
217-
JOIN snowflake_sample_data.tpch_sf1.part p_target
218-
ON nmp_target.p_partkey = p_target.p_partkey
219-
ORDER BY sim.similarity DESC
220-
LIMIT 10;
199+
JOIN tpch_example.gds.part_similar_to_part sim
200+
ON p_source.p_partkey = sim.sourcenodeid
201+
JOIN snowflake_sample_data.tpch_sf1.part p_target
202+
ON p_target.p_partkey = sim.targetnodeid
203+
ORDER BY sim.similarity DESC LIMIT 10;
221204

222205
-- The GDS service is a long-running service and should be stopped when not in use.
223206
-- Once we completed our analysis, we can stop the session, which suspends the container service.
224207
-- We can restart the session at any time to continue our analysis.
225-
CALL Neo4j_GDS.gds.stop_session();
226-
208+
CALL gds.stop_session();

0 commit comments

Comments
 (0)