Skip to content

Commit 8cb290f

Browse files
committed
Add examples for GFD mining
Added two examples with searching for dependencies in small graphs.
1 parent 48c7132 commit 8cb290f

10 files changed

+307
-0
lines changed
Loading
Loading
Loading
Loading
Loading
Loading
+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from pathlib import Path
2+
3+
import desbordante
4+
import matplotlib.pyplot as plt
5+
import matplotlib.image as mpimg
6+
7+
8+
class bcolors:
    """ANSI escape sequences for colouring this example's terminal output."""

    # Resets all text attributes; `colored` appends it after every message.
    ENDC = '\033[0m'

    # 24-bit (R;G;B) foreground colours, one per vertex label of the graph.
    ARTICLE = '\033[38;2;173;255;47m'
    PERSON = '\033[38;2;46;139;87m'

    # Bright foreground colours for the "Desbordante > " prompt and the
    # closing hint respectively.
    HEADER = '\033[95m'
    WARNING = '\033[93m'
14+
15+
16+
def colored(message, color):
    """Return ``message`` wrapped in ``color`` and terminated by a reset code.

    Args:
        message: text to highlight.
        color: escape sequence placed in front of the text
            (one of the ``bcolors`` constants).
    """
    opened = color + message
    # Always close with the reset sequence so the colour does not leak
    # into whatever is printed next.
    return opened + bcolors.ENDC
18+
19+
20+
# Base names shared by the dataset file and the pre-rendered figures.
GRAPH_NAME = 'papers_graph'
EMBEDDINGS_NAME = 'papers_embeddings'
GFD_NAME = 'papers_gfd'

# Every location below is a relative path, resolved against the current
# working directory at run time.
GRAPHS_DATASETS_FOLDER_PATH = 'examples/datasets/mining_gfd'
_FIGURES_FOLDER = Path('examples/basic/mining_gfd/figures')

# Input graph (DOT file) handed to the mining algorithm.
GRAPH = Path(GRAPHS_DATASETS_FOLDER_PATH) / f'{GRAPH_NAME}.dot'

# Images displayed by show_example().
GRAPH_IMAGE = _FIGURES_FOLDER / 'graphs' / f'{GRAPH_NAME}.png'
EMBEDDINGS_IMAGE = _FIGURES_FOLDER / 'graphs' / f'{EMBEDDINGS_NAME}.png'
GFD_IMAGE = _FIGURES_FOLDER / 'gfds' / f'{GFD_NAME}.png'
31+
32+
# Opening paragraph: which GFD tasks exist and which paper the miner follows.
PREAMBLE = (
    'Our profiler supports two tasks related to graph functional dependencies (GFDs): '
    'validation and mining (discovery). '
    'In this example, we will focus on the mining task '
    '(for validation, we refer the reader to another example). '
    'The mining algorithm used in our profiler is described in the article '
    '"Discovering Graph Functional Dependencies" '
    'by Fan Wenfei, Hu Chunming, Liu Xueli, and Lu Pinge, '
    "presented at SIGMOD '18.\n"
)

# One-paragraph definition of a GFD.
GFD_INFO = (
    'GFDs are functional dependencies that consist of a pattern '
    '- a graph that specifies the scope - and a rule. '
    'The nature of this object will become clearer through the example that follows.\n'
)
41+
42+
# Description of the example graph: its two vertex labels and the attributes
# attached to each of them. Label and attribute names are colour-coded.
GRAPH_INFO = (
    "Let's analyze GFD mining through an example. "
    "Look at the graph presented on the top left in the figure. "
    "It describes the connections between scientific articles and their authors. "
    f"The vertices of this graph have two labels: {colored('Article (A)', bcolors.ARTICLE)} "
    f"and {colored('Person (P)', bcolors.PERSON)}. "
    "Each vertex has its own set of attributes depending on the label.\n\n"
    # Attributes of Article vertices.
    f"{colored('Article', bcolors.ARTICLE)}:\n"
    f"- {colored('title', bcolors.ARTICLE)} denotes the title of the article.\n\n"
    # Attributes of Person vertices.
    f"{colored('Person', bcolors.PERSON)}:\n"
    f"- {colored('name', bcolors.PERSON)} denotes the name of a person,\n"
    f"- {colored('role', bcolors.PERSON)} can take one of two values: "
    '"teacher" or "student".\n'
)
53+
54+
# Explanation of the two numeric parameters of the mining algorithm.
ALGO_INFO = '\n'.join((
    'The discovery algorithm, in addition to the graph, takes two parameters as input:',
    '- k: the maximum number of vertices in the pattern,',
    '- sigma: the minimum frequency of GFD occurrences in the original graph.',
    '',  # yields the trailing newline
))

# Announces the parameter values that execute_algo() passes to the miner.
INFO = (
    "Let's run the algorithm and look at the result. "
    "We will set k=3 and sigma=2.\n"
)

# Bridges the raw DOT output and the hand-written rendering in GFD_TEXT.
REWRITING = (
    "It may be difficult to interpret, "
    "so let's rewrite it to a more human-readable format. "
    'Note that the empty line immediately following the colon (":") indicates '
    'that the left-hand side of the dependency has no conditions. '
    'Conversely, if the right-hand side of the dependency had no conditions, '
    'the second line would be empty.\n'
)
65+
66+
# Human-readable rendering of the mined dependency: a row of vertex indices,
# the pattern itself, then the rule "{premise} --> {conclusion}".
#
# The index row is padded so that each index sits directly above the letter of
# the vertex it labels in the pattern row (visible columns 2, 7 and 12); the
# ANSI colour codes occupy no width on screen and do not affect the alignment.
GFD_TEXT = (
    f'  {colored("0", bcolors.ARTICLE)}    {colored("1", bcolors.PERSON)}'
    f'    {colored("2", bcolors.ARTICLE)}\n'
    f' {colored("(A)", bcolors.ARTICLE)}--{colored("(P)", bcolors.PERSON)}'
    f'--{colored("(A)", bcolors.ARTICLE)}\n'
    # Doubled braces are literal "{" / "}" inside an f-string.
    f'{{}} --> {{{colored("1", bcolors.PERSON)}.{colored("role", bcolors.PERSON)}=teacher}}\n\n'
    'The mined dependency can also be seen on the right in the figure.\n'
)
72+
73+
# Plain-language reading of the mined dependency.
RESULTS = (
    'The discovered dependency can be expressed as the following fact: '
    'If a person has two published articles, then they are a teacher.\n'
)

# Pointer to the companion example script.
EXAMPLE_INFO = (
    'It is recommended to look at the second example '
    'for a deeper understanding of graph functional dependency mining. '
    'It is located in the file "mining_gfd2.py".\n'
)
79+
80+
# Closing hint in the warning colour; the script prints it just before the
# figure window is opened.
EXIT = colored(
    message="Close the image window to finish.",
    color=bcolors.WARNING,
)
81+
82+
83+
def execute_algo(algo):
    """Mine GFDs from the example graph with ``algo`` and print the outcome.

    The graph at ``GRAPH`` is loaded with ``gfd_k=3`` and ``gfd_sigma=2``
    (the values announced in ``INFO``). After execution the number of mined
    dependencies and each dependency are printed, followed by the
    ``REWRITING`` explanation and the hand-formatted ``GFD_TEXT``.

    Args:
        algo: mining algorithm object providing ``load_data``, ``execute``
            and ``get_gfds``.
    """
    algo.load_data(graph=GRAPH, gfd_k=3, gfd_sigma=2)
    algo.execute()

    # Fetch the result once and reuse it for both the counter and the
    # listing instead of asking the algorithm for it twice.
    mined_gfds = algo.get_gfds()

    print(f'{bcolors.HEADER}Desbordante > {bcolors.ENDC}', end='')
    print('Mined GFDs:', len(mined_gfds))
    print()
    print("Let's print found dependency (in DOT language):")
    for gfd in mined_gfds:
        print(gfd)
    print(REWRITING)
    print(GFD_TEXT)
94+
95+
96+
def show_example():
    """Display the three pre-rendered figures of the example in one window.

    The figure starts as a 2x2 grid; the two axes of the right column are
    removed and replaced by a single axes spanning both rows. The left column
    shows the original graph (top) and the GFD embeddings (bottom), the right
    panel shows the mined GFD. The call returns when ``plt.show()`` returns.
    """
    figure, grid = plt.subplots(
        2, 2,
        figsize=(14, 6),
        gridspec_kw={'width_ratios': [7, 3], 'wspace': 0.5},
    )

    # Merge the right column into one tall panel.
    spec = grid[0, 1].get_gridspec()
    for right_axis in grid[:, 1]:
        right_axis.remove()
    tall_panel = figure.add_subplot(spec[:, -1])

    panels = (
        (grid[0, 0], '$Original$ $graph$', GRAPH_IMAGE),
        (grid[1, 0], '$GFD$ $embeddings$', EMBEDDINGS_IMAGE),
        (tall_panel, '$Mined$ $GFD$', GFD_IMAGE),
    )
    for axis, title, image_path in panels:
        axis.set_axis_off()
        axis.set_title(title)
        axis.imshow(mpimg.imread(image_path))

    plt.show()
113+
114+
115+
# Script body: introduction, mining run, interpretation, then the figure.
for intro_text in (PREAMBLE, GFD_INFO, GRAPH_INFO, ALGO_INFO, INFO):
    print(intro_text)

execute_algo(desbordante.gfd_mining.algorithms.GfdMiner())

for closing_text in (RESULTS, EXAMPLE_INFO, EXIT):
    print(closing_text)

show_example()
+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
from pathlib import Path
2+
3+
import desbordante
4+
import matplotlib.pyplot as plt
5+
import matplotlib.image as mpimg
6+
7+
8+
class bcolors:
    """ANSI escape sequences for colouring this example's terminal output."""

    # Resets all text attributes; `colored` appends it after every message.
    ENDC = '\033[0m'

    # 24-bit (R;G;B) foreground colours, one per vertex label of the graph.
    STUDENT = '\033[38;2;254;136;99m'
    TASK = '\033[38;2;87;206;235m'

    # Bright foreground colours for the "Desbordante > " prompt and the
    # closing hint respectively.
    HEADER = '\033[95m'
    WARNING = '\033[93m'
14+
15+
16+
def colored(message, color):
    """Return ``message`` wrapped in ``color`` and terminated by a reset code.

    Args:
        message: text to highlight.
        color: escape sequence placed in front of the text
            (one of the ``bcolors`` constants).
    """
    opened = color + message
    # Always close with the reset sequence so the colour does not leak
    # into whatever is printed next.
    return opened + bcolors.ENDC
18+
19+
20+
# Base names shared by the dataset file and the pre-rendered figures.
GRAPH_NAME = 'study_graph'
EMBEDDINGS_NAME = 'study_embeddings'
GFD_NAME = 'study_gfd'

# Every location below is a relative path, resolved against the current
# working directory at run time.
GRAPHS_DATASETS_FOLDER_PATH = 'examples/datasets/mining_gfd'
_FIGURES_FOLDER = Path('examples/basic/mining_gfd/figures')

# Input graph (DOT file) handed to the mining algorithm.
GRAPH = Path(GRAPHS_DATASETS_FOLDER_PATH) / f'{GRAPH_NAME}.dot'

# Images displayed by show_example().
GRAPH_IMAGE = _FIGURES_FOLDER / 'graphs' / f'{GRAPH_NAME}.png'
EMBEDDINGS_IMAGE = _FIGURES_FOLDER / 'graphs' / f'{EMBEDDINGS_NAME}.png'
GFD_IMAGE = _FIGURES_FOLDER / 'gfds' / f'{GFD_NAME}.png'
31+
32+
# Opening paragraph: which GFD tasks exist and which paper the miner follows.
PREAMBLE = (
    'Our profiler supports two tasks related to graph functional dependencies (GFDs): '
    'validation and mining (discovery). '
    'In this example, we will focus on the mining task '
    '(for validation, we refer the reader to another example). '
    'The mining algorithm used in our profiler is described in the article '
    '"Discovering Graph Functional Dependencies" '
    'by Fan Wenfei, Hu Chunming, Liu Xueli, and Lu Pinge, '
    "presented at SIGMOD '18.\n"
)

# One-paragraph definition of a GFD.
GFD_INFO = (
    'GFDs are functional dependencies that consist of a pattern '
    '- a graph that specifies the scope - and a rule. '
    'The nature of this object will become clearer through the example that follows.\n'
)
41+
42+
# Description of the example graph: its two vertex labels and the attributes
# attached to each of them. Label and attribute names are colour-coded.
#
# NOTE(review): the text lists a `name` attribute for Task vertices, but the
# task vertices of the study graph dataset added in this commit declare only
# `difficulty` -- confirm that the description matches the data and figure.
GRAPH_INFO = (
    "Let's analyze GFD mining through an example. "
    "Look at the graph presented on the top left in the figure. "
    "It describes the connections between students and tasks. "
    f"The vertices of this graph have two labels: {colored('Student (S)', bcolors.STUDENT)} "
    f"and {colored('Task (T)', bcolors.TASK)}. "
    "Each vertex has its own set of attributes depending on the label.\n\n"
    # Attributes of Student vertices.
    f"{colored('Student', bcolors.STUDENT)}:\n"
    f"- {colored('name', bcolors.STUDENT)} denotes the name of the student,\n"
    f"- {colored('degree', bcolors.STUDENT)} is the level of education,\n"
    f"- {colored('year', bcolors.STUDENT)} is the year of study.\n\n"
    # Attributes of Task vertices.
    f"{colored('Task', bcolors.TASK)}:\n"
    f"- {colored('name', bcolors.TASK)} denotes the name of a task,\n"
    f"- {colored('difficulty', bcolors.TASK)} is a categorical parameter "
    "that takes one of the following values: "
    '"easy", "normal" or "hard".\n'
)
56+
57+
# Explanation of the two numeric parameters of the mining algorithm.
ALGO_INFO = '\n'.join((
    'The discovery algorithm, in addition to the graph, takes two parameters as input:',
    '- k: the maximum number of vertices in the pattern,',
    '- sigma: the minimum frequency of GFD occurrences in the original graph.',
    '',  # yields the trailing newline
))

# Announces the parameter values that execute_algo() passes to the miner.
INFO = (
    "Let's run the algorithm and look at the result. "
    "We will set k=2 and sigma=3.\n"
)

# Bridges the raw DOT output and the hand-written rendering in GFD_TEXT.
REWRITING = (
    "It may be difficult to interpret, "
    "so let's rewrite it to a more human-readable format. "
    'Notation: the first line contains the literals found in the left-hand side. '
    'The second line contains those in the right-hand side.\n'
)
66+
67+
# Human-readable rendering of the mined dependency: a row of vertex indices,
# the pattern itself, then the rule "{premise} --> {conclusion}".
#
# The index row is padded so that each index sits directly above the letter of
# the vertex it labels in the pattern row (visible columns 2 and 7); the ANSI
# colour codes occupy no width on screen and do not affect the alignment.
GFD_TEXT = (
    f'  {colored("0", bcolors.TASK)}    {colored("1", bcolors.STUDENT)}\n'
    f' {colored("(T)", bcolors.TASK)}--{colored("(S)", bcolors.STUDENT)}\n'
    # Doubled braces are literal "{" / "}" inside an f-string.
    f'{{{colored("0", bcolors.TASK)}.{colored("difficulty", bcolors.TASK)}=hard}} --> '
    f'{{{colored("1", bcolors.STUDENT)}.{colored("degree", bcolors.STUDENT)}=master & '
    f'{colored("1", bcolors.STUDENT)}.{colored("year", bcolors.STUDENT)}=2}}\n\n'
    'The mined dependency can also be seen on the right in the figure.\n'
)
76+
77+
# Plain-language reading of the mined dependency.
RESULTS = (
    'The dependency found indicates that '
    "only second-year master's students are working on the difficult task.\n"
)

# Pointer to the companion example script.
EXAMPLE_INFO = (
    'It is recommended to look at the first example '
    'for a deeper understanding of graph functional dependency mining. '
    'It is located in the file "mining_gfd1.py".\n'
)
83+
84+
# Closing hint in the warning colour; the script prints it just before the
# figure window is opened.
EXIT = colored(
    message="Close the image window to finish.",
    color=bcolors.WARNING,
)
85+
86+
87+
def execute_algo(algo):
    """Mine GFDs from the example graph with ``algo`` and print the outcome.

    The graph at ``GRAPH`` is loaded with ``gfd_k=2`` and ``gfd_sigma=3``
    (the values announced in ``INFO``). After execution the number of mined
    dependencies and each dependency are printed, followed by the
    ``REWRITING`` explanation and the hand-formatted ``GFD_TEXT``.

    Args:
        algo: mining algorithm object providing ``load_data``, ``execute``
            and ``get_gfds``.
    """
    algo.load_data(graph=GRAPH, gfd_k=2, gfd_sigma=3)
    algo.execute()

    # Fetch the result once and reuse it for both the counter and the
    # listing instead of asking the algorithm for it twice.
    mined_gfds = algo.get_gfds()

    print(f'{bcolors.HEADER}Desbordante > {bcolors.ENDC}', end='')
    print('Mined GFDs:', len(mined_gfds))
    print()
    print("Let's print found dependency (in DOT language):")
    for gfd in mined_gfds:
        print(gfd)
    print(REWRITING)
    print(GFD_TEXT)
98+
99+
100+
def show_example():
    """Display the three pre-rendered figures of the example in one window.

    The figure starts as a 2x2 grid; the two axes of the right column are
    removed and replaced by a single axes spanning both rows. The left column
    shows the original graph (top) and the GFD embeddings (bottom), the right
    panel shows the mined GFD. The call returns when ``plt.show()`` returns.
    """
    figure, grid = plt.subplots(
        2, 2,
        figsize=(16, 7),
        gridspec_kw={'width_ratios': [7, 3], 'wspace': 0.5},
    )

    # Merge the right column into one tall panel.
    spec = grid[0, 1].get_gridspec()
    for right_axis in grid[:, 1]:
        right_axis.remove()
    tall_panel = figure.add_subplot(spec[:, -1])

    panels = (
        (grid[0, 0], '$Original$ $graph$', GRAPH_IMAGE),
        (grid[1, 0], '$GFD$ $embeddings$', EMBEDDINGS_IMAGE),
        (tall_panel, '$Mined$ $GFD$', GFD_IMAGE),
    )
    for axis, title, image_path in panels:
        axis.set_axis_off()
        axis.set_title(title)
        axis.imshow(mpimg.imread(image_path))

    plt.show()
117+
118+
119+
# Script body: introduction, mining run, interpretation, then the figure.
for intro_text in (PREAMBLE, GFD_INFO, GRAPH_INFO, ALGO_INFO, INFO):
    print(intro_text)

execute_algo(desbordante.gfd_mining.algorithms.GfdMiner())

for closing_text in (RESULTS, EXAMPLE_INFO, EXIT):
    print(closing_text)

show_example()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
graph G {
2+
0[label=article title="Exploring the Impact of Climate Change on Marine Biodiversity"];
3+
1[label=article title="Advancements in Quantum Computing: A New Era of Information Processing"];
4+
2[label=article title="The Role of Artificial Intelligence in Modern Healthcare Systems"];
5+
3[label=article title="Understanding the Genetic Basis of Resilience in Plant Species"];
6+
4[label=person name="Emily Carter" role=teacher];
7+
5[label=person name="James Thompson" role=student];
8+
6[label=person name="Sophia Martinez" role=teacher];
9+
7[label=person name="Liam Johnson" role=student];
10+
8[label=person name="Ava Patel" role=student];
11+
0--4 [label="*"];
12+
1--4 [label="*"];
13+
1--5 [label="*"];
14+
1--6 [label="*"];
15+
2--6 [label="*"];
16+
3--6 [label="*"];
17+
3--7 [label="*"];
18+
3--8 [label="*"];
19+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
graph G {
2+
0[label=task difficulty=easy];
3+
1[label=task difficulty=normal];
4+
2[label=task difficulty=normal];
5+
3[label=task difficulty=hard];
6+
4[label=task difficulty=hard];
7+
5[label=task difficulty=hard];
8+
6[label=student name=James degree=bachelor year=2];
9+
7[label=student name=Michael degree=master year=1];
10+
8[label=student name=Robert degree=bachelor year=3];
11+
9[label=student name=John degree=master year=2];
12+
10[label=student name=David degree=bachelor year=4];
13+
11[label=student name=William degree=master year=2];
14+
12[label=student name=Richard degree=master year=2];
15+
13[label=student name=Joseph degree=master year=2];
16+
14[label=student name=Thomas degree=master year=2];
17+
15[label=student name=Christopher degree=master year=2];
18+
0--6 [label=performs];
19+
1--6 [label=performs];
20+
1--7 [label=performs];
21+
1--10 [label=performs];
22+
2--7 [label=performs];
23+
2--8 [label=performs];
24+
2--9 [label=performs];
25+
3--9 [label=performs];
26+
3--11 [label=performs];
27+
3--12 [label=performs];
28+
4--12 [label=performs];
29+
4--13 [label=performs];
30+
4--14 [label=performs];
31+
5--11 [label=performs];
32+
5--14 [label=performs];
33+
5--15 [label=performs];
34+
}

0 commit comments

Comments
 (0)