@@ -50,70 +50,42 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
50
50
-- ==================================================
51
51
-- 1. Data preparation
52
52
-- ==================================================
53
-
54
53
-- Create a database which we will use to prepare data for GDS.
55
54
CREATE DATABASE IF NOT EXISTS tpch_example;
56
55
CREATE SCHEMA IF NOT EXISTS tpch_example.gds;
57
56
USE SCHEMA tpch_example.gds;
58
57
59
- -- GDS expects the data to be in a specific format: a table/view for nodes and a table/view for relationships.
60
- -- In addition, GDS requires node identifiers to be globally unique integers.
58
+ -- GDS reads data from tables that represent nodes and relationships.
59
+ -- Nodes are usually represented by entity tables, like persons or products.
60
+ -- Relationships are foreign keys between entity tables (1:1, 1:n) or via mapping tables (n:m).
61
+ -- In addition, GDS expects certain naming conventions on column names.
62
+ -- If the data is not yet in the right format, we can use views to get there.
61
63
--
62
- -- For our analysis, the nodes will be parts and the orders in which they appeared.
64
+ -- For our analysis, we will use two different types of nodes: parts and orders.
65
+ -- We want to find similar parts by looking at the orders in which they appeared.
63
66
-- The relationships will be the line items linking a part to an order.
64
- --
65
- -- We start by creating the node view for our graph.
66
- -- First we need to map the primary keys for parts and orders to globally unique node ids.
67
-
68
- -- We use a sequence to generate globally unique node identifiers.
69
- CREATE OR REPLACE SEQUENCE global_id START = 0 INCREMENT = 1;
70
-
71
- -- We create two mapping tables, one for parts and one for orders.
72
- -- This is necessary because the primary key sets for both tables might overlap.
73
- CREATE OR REPLACE TABLE node_mapping_parts(gdsId, p_partkey) AS
74
- SELECT global_id.nextval, p_partkey
75
- FROM snowflake_sample_data.tpch_sf1.part;
76
- CREATE OR REPLACE TABLE node_mapping_orders(gdsId, o_orderkey) AS
77
- SELECT global_id.nextval, o_orderkey
78
- FROM snowflake_sample_data.tpch_sf1.orders;
79
-
80
- -- Next, we can create the final node view that we use for our graph projection.
81
- -- Note, that the view must contain a column named "nodeId" to be recognized by GDS.
82
- -- Any additional column will be used as node property, but we don't need that for this example.
83
- CREATE OR REPLACE VIEW nodes(nodeId) AS
84
- SELECT nmp.gdsId FROM node_mapping_parts nmp
85
- UNION
86
- SELECT nmo.gdsId FROM node_mapping_orders nmo;
87
-
88
- -- Let's quickly verify the cardinality of our views.
89
- -- As it is the union of parts and orders, we expect 1,700,000 rows.
90
- SELECT count(*) FROM nodes;
91
-
92
- -- We can now create the relationship view.
93
- -- As mentioned earlier, we will use the line items to create relationships between parts and orders.
94
- -- We join the line items with parts and orders to get the source and target nodes for our relationships.
95
- -- We also join the mapping tables to get the globally unique node ids.
96
- -- Note, that the view must contain columns named "sourceNodeId" and "targetNodeId" to be recognized by GDS.
97
- -- Any additional column will be used as relationship property, but we don't need that for this example.
98
- CREATE OR REPLACE VIEW relationships(sourceNodeId, targetNodeId) AS
99
- SELECT
100
- nmp.gdsId AS sourceNodeId,
101
- nmo.gdsId AS targetNodeId
102
- FROM snowflake_sample_data.tpch_sf1.part p
103
- -- The first two joins build the relationships between parts and orders
104
- JOIN snowflake_sample_data.tpch_sf1.lineitem l
105
- ON p.p_partkey = l.l_partkey
106
- JOIN snowflake_sample_data.tpch_sf1.orders o
107
- ON o.o_orderkey = l.l_orderkey
108
- -- The second two joins map the primary keys to globally unique node ids
109
- JOIN node_mapping_parts nmp
110
- ON nmp.p_partkey = p.p_partkey
111
- JOIN node_mapping_orders nmo
112
- ON nmo.o_orderkey = o.o_orderkey;
113
-
114
- -- Let's quickly verify the cardinality of our relationship view.
115
- -- As it is the join of parts, line items, and orders, we expect 6,001,215 rows.
116
- SELECT count(*) FROM relationships;
67
+ -- The result will be a new table containing pairs of parts including their similarity score.
68
+
69
+ -- We start by creating two views to represent our node tables.
70
+ -- GDS requires a node table to contain a 'nodeId' column.
71
+ -- Since we do not need any node properties, this will be the only column we project.
72
+ -- Note, that the `nodeId` column is used to uniquely identify a node in the table.
73
+ -- The uniqueness is usually achieved by using the primary key in that table, here 'p_partkey'.
74
+ CREATE OR REPLACE VIEW parts (nodeId) AS
75
+ SELECT p.p_partkey AS nodeId FROM snowflake_sample_data.tpch_sf1.part p;
76
+
77
+ -- We do the same for the orders by projecting the `o_orderkey` to 'nodeId'.
78
+ CREATE OR REPLACE VIEW orders (nodeId) AS
79
+ SELECT o.o_orderkey AS nodeId FROM snowflake_sample_data.tpch_sf1.orders o;
80
+
81
+ -- The line items represent the relationship between parts and orders.
82
+ -- GDS requires a `sourceNodeId` and a `targetNodeId` column to identify the source and target of each relationship.
83
+ -- Here, a part is the source of a relationship and an order is the target.
84
+ CREATE OR REPLACE VIEW part_in_order (sourceNodeId, targetNodeId) AS
85
+ SELECT
86
+     l.l_partkey AS sourceNodeId,
87
+     l.l_orderkey AS targetNodeId
88
+ FROM snowflake_sample_data.tpch_sf1.lineitem l;
117
89
118
90
-- We have now prepared the data for GDS.
119
91
@@ -127,8 +99,8 @@ USE DATABASE Neo4j_GDS;
127
99
-- Next, we want to consider the warehouse that the GDS application will use to execute queries.
128
100
-- For this example we use a MEDIUM size warehouse, so we configure the application's warehouse accordingly.
129
101
ALTER WAREHOUSE Neo4j_GDS_app_warehouse SET WAREHOUSE_SIZE='MEDIUM';
130
- -- A highly performant warehouse will speed up graph projections but does not affect algorithm computation.
131
- -- It can therefore be a good idea to alter the warehouse size and make other configuration changes to increase performance when projecting larger amounts of data .
102
+ -- A highly performant warehouse can speed up graph projections but does not affect algorithm computation.
103
+ -- Especially if the views are more complex than shown in this example, a more performant warehouse is beneficial.
132
104
-- The warehouse can then be brought back to a less expensive configuration after the projection is done.
133
105
-- ALTER WAREHOUSE Neo4j_GDS_app_warehouse
134
106
-- WAREHOUSE_SIZE='X-SMALL';
@@ -169,12 +141,26 @@ CALL gds.create_session('CPU_X64_L');
169
141
170
142
-- Once the session is started, we can project our node and relationship views into a GDS in-memory graph.
171
143
-- The graph will be identified by the name "parts_in_orders".
172
- -- The mandatory parameters are the node table and the relationship table, which we point those to our prepared views.
144
+ -- The mandatory parameters are the node tables and the relationship tables.
145
+ -- A node table mapping points from a table/view to a node label that is used in the GDS graph.
146
+ -- For example, the rows of 'tpch_example.gds.parts' will be nodes labeled as 'Part'.
147
+ -- Relationship tables need a bit more configuration.
148
+ -- Besides the type that is used in the GDS graph, here 'PART_IN_ORDER', we also need to specify source and target tables.
173
149
-- We also specify the optional read concurrency to optimize building the graph projection.
174
150
-- The concurrency can be set to the number of cores available on the compute pool node.
175
151
SELECT gds.graph_project('parts_in_orders', {
176
- 'nodeTable': 'tpch_example.gds.nodes',
177
- 'relationshipTable': 'tpch_example.gds.relationships',
152
+ 'nodeTables': {
153
+     'tpch_example.gds.parts': 'Part',
154
+     'tpch_example.gds.orders': 'Order'
155
+ },
156
+ 'relationshipTables': {
157
+     'tpch_example.gds.part_in_order': {
158
+         'type': 'PART_IN_ORDER',
159
+         'source_table': 'tpch_example.gds.parts',
160
+         'target_table': 'tpch_example.gds.orders',
161
+         'orientation': 'NATURAL'
162
+     }
163
+ },
178
164
'readConcurrency': 28
179
165
});
180
166
@@ -192,8 +178,10 @@ SELECT gds.node_similarity('parts_in_orders', {
192
178
193
179
-- Once the algorithm has finished, we can write the results back to Snowflake tables for further analysis.
194
180
-- We want to write back the similarity relationships between parts.
195
- -- The specified table will contain the globally unique source and target node ids and the similarity score.
181
+ -- The specified table will contain the original source and target node ids and the similarity score.
196
182
SELECT gds.write_relationships('parts_in_orders', {
183
+ 'sourceLabel': 'Part',
184
+ 'targetLabel': 'Part',
197
185
'relationshipType': 'SIMILAR_TO',
198
186
'relationshipProperty': 'similarity',
199
187
'table': 'tpch_example.gds.part_similar_to_part'
@@ -208,19 +196,13 @@ GRANT SELECT ON tpch_example.gds.part_similar_to_part TO ROLE <your_role>;
208
196
-- Simply speaking, this could be used as a recommendation system for parts.
209
197
SELECT DISTINCT p_source.p_name, p_target.p_name, sim.similarity
210
198
FROM snowflake_sample_data.tpch_sf1.part p_source
211
- JOIN tpch_example.gds.node_mapping_parts nmp_source
212
- ON p_source.p_partkey = nmp_source.p_partkey
213
- JOIN tpch_example.gds.part_similar_to_part sim
214
- ON nmp_source.gdsid = sim.sourcenodeid
215
- JOIN tpch_example.gds.node_mapping_parts nmp_target
216
- ON sim.targetnodeid = nmp_target.gdsid
217
- JOIN snowflake_sample_data.tpch_sf1.part p_target
218
- ON nmp_target.p_partkey = p_target.p_partkey
219
- ORDER BY sim.similarity DESC
220
- LIMIT 10;
199
+ JOIN tpch_example.gds.part_similar_to_part sim
200
+     ON p_source.p_partkey = sim.sourcenodeid
201
+ JOIN snowflake_sample_data.tpch_sf1.part p_target
202
+     ON p_target.p_partkey = sim.targetnodeid
203
+ ORDER BY sim.similarity DESC LIMIT 10;
221
204
222
205
-- The GDS service is a long-running service and should be stopped when not in use.
223
206
-- Once we completed our analysis, we can stop the session, which suspends the container service.
224
207
-- We can restart the session at any time to continue our analysis.
225
- CALL Neo4j_GDS.gds.stop_session();
226
-
208
+ CALL gds.stop_session();
0 commit comments