Skip to content

Commit

Permalink
Merge branch 'develop' into feat/eliminate-duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
nikolas-rauscher authored Jul 3, 2024
2 parents 68033f1 + 3fdd460 commit d1e3481
Show file tree
Hide file tree
Showing 47 changed files with 3,083 additions and 1,087 deletions.
41 changes: 41 additions & 0 deletions Documentation/llmExtractionMeasurments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Measurements of running the graph creator
# india health article

llama3, chunk_size=1500, chunk_overlap=150, 30 calls/min, 60 sec wait

gemini-pro, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait

gemini-flash, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait

# Measurements as averages over all llm calls
# two kinds of prompts used: extract_entities_and_relations and check_for_connecting_relation


# Execution speed of prompts by llm model

gemini-flash: 12,75s (10 extraction requests) per request extracting, 1.57s connecting

gemini: 23,54s (10 extraction requests) per request extracting, 2,37s connecting

groq+llama3: 0.72s (10 extraction requests) per request extracting, 0,48s connecting

---------------------------------------

# Statistics on the number of extracted entities by llm model

llama: 3078 tokens / 1770 words -> 177 / 180 entities (34 / 47 connecting requests)

gemini: 3078 tokens / 1770 words -> 303 / 316 entities (35 connecting requests)

gemini-flash: 3078 tokens / 1770 words -> 309 / 369 entities (28 connecting requests)


-----------------------------------------

# Duration of knowledge graph extraction by llm model

gemini-flash: 127,5s for entity extraction and 105s for connecting

gemini: 212s for entity extraction and 189s for connecting

groq+llama3: 7,9s for entity extraction and 136s for connecting
152 changes: 44 additions & 108 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@
import os
import json

def get_top_n_central_nodes(centrality_dict, n):
    """Return the identifiers of the top-N nodes ranked by centrality.

    Args:
        centrality_dict: Mapping of node identifier -> centrality value,
            as produced by the networkx centrality functions
            (e.g. ``nx.degree_centrality``).
        n: Number of top nodes to return. If ``n`` exceeds the number of
            nodes, every node is returned (sorted); if ``n <= 0`` the
            result is empty.

    Returns:
        List of at most ``n`` node identifiers in descending order of
        centrality. Note: only the node names are returned, not the
        centrality values.
    """
    # Sort by centrality value (item[1]) descending; ties keep the
    # dict's insertion order because Python's sort is stable.
    ranked = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
    return [node for node, _ in ranked[:n]]

def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Expand All @@ -17,15 +31,16 @@ def analyze_graph_structure(G):
# Basic Graph Statistics
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges
density = nx.density(G) # Ratio of actual edges to possible edges (0 to 1)
average_degree = 2 * num_edges / num_nodes # Average number of edges per node

# Degree Distribution
# Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes

degree_centrality = nx.degree_centrality(G)
if num_nodes == 0 or num_edges == 0:
raise ValueError("The graph is empty or not properly constructed.")

# Degree Centrality: Measures node connectivity
degree_centrality = nx.degree_centrality(G)
""" Centrality Measures
- Degree Centrality: Measures node connectivity
- Nodes with high degree centrality are important in the network
Expand All @@ -36,9 +51,10 @@ def analyze_graph_structure(G):
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
"""

betweenness_centrality = nx.betweenness_centrality(G)

"""
# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
- Betweenness Centrality: Measures node's control over information flow
- Nodes with high betweenness centrality are important in the network
Expand All @@ -54,24 +70,8 @@ def analyze_graph_structure(G):
- Betweenness Centrality show the dependency of the network on a node
"""

# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
Examples: 4 nodes are connected
0
/ | \
2--1--3
- Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
- Closeness Centrality show the average distance of a node to all other nodes in the network
"""

# - Eigenvector Centrality: Measures influence of a node in a network

# eigenvector centrality measures the influence of a node in a network
eigenvector_centrality = nx.eigenvector_centrality(G)

"""
Expand All @@ -92,95 +92,31 @@ def analyze_graph_structure(G):
"""

# Community Structure
# - Louvain Algorithm (for community detection)
communities = list(nx.community.greedy_modularity_communities(G))
community_sizes = [len(community) for community in communities]
num_communities = len(communities)
# Communities can reveal modular structures in the graph
"""
- Community Detection: Identifying groups of nodes that are more connected to each other than to the rest of the network
- Communities can reveal modular structures in the graph
- Communities can be used to identify groups of nodes that are more connected to each other than to the rest of the network
Examples: 7 nodes are connected
1
/ \
2-----3
\ / 5
4-----/ \
6-----7
# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

- Here, nodes 1, 2, 3, 4 are in one community and nodes 5, 6, 7 are in another community
"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
# Graph Connectivity
# - Check if the graph is connected
is_connected = nx.is_connected(G)
# - Calculate diameter: Longest shortest path between any two nodes
diameter = nx.diameter(G) if is_connected else float('inf')
# - Average shortest path length: Average of all shortest paths in the graph
average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')

# Clustering Coefficient
# - Measures the degree to which nodes tend to cluster together
average_clustering_coefficient = nx.average_clustering(G)

# Assortativity
# - Measures the similarity of connections in the graph with respect to node degree
assortativity = nx.degree_assortativity_coefficient(G)

# Graph Diameter and Radius
# - Diameter: Longest shortest path in the graph
# - Radius: Minimum eccentricity of any node
radius = nx.radius(G) if is_connected else float('inf')

# Graph Transitivity
# - Measures the overall probability for the network to have adjacent nodes interconnected
transitivity = nx.transitivity(G)

# Return a dictionary containing the structural information
graph_info = {
"num_nodes": num_nodes,
"num_edges": num_edges,
"density": density,
"average_degree": average_degree,
"degree_distribution": degree_distribution,
"degree_centrality": degree_centrality,
"betweenness_centrality": betweenness_centrality,
"closeness_centrality": closeness_centrality,
"eigenvector_centrality": eigenvector_centrality,
"num_communities": num_communities,
"community_sizes": community_sizes,
"is_connected": is_connected,
"diameter": diameter,
"average_shortest_path_length": average_shortest_path_length,
"average_clustering_coefficient": average_clustering_coefficient,
"assortativity": assortativity,
"radius": radius,
"transitivity": transitivity
}

return graph_info


def print_graph_info(graph_info):
"""Prints the graph information in a formatted and readable way.
Examples: 4 nodes are connected
0
/ | \
2--1--3
Args:
graph_info: A dictionary containing information about the graph's structure.
- Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
- Closeness Centrality show the average distance of a node to all other nodes in the network
"""
n = 20 # Number of top nodes to return
# Calculate centrality measures
degree_centrality = get_top_n_central_nodes(nx.degree_centrality(G), n)
betweenness_centrality = get_top_n_central_nodes(nx.betweenness_centrality(G), n)
eigenvector_centrality = get_top_n_central_nodes(nx.eigenvector_centrality(G), n)
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)

print(json.dumps(graph_info, indent=4))

# Find intersection of top nodes from all measures (set intersection)
all_centrality_nodes = set(degree_centrality) & set(betweenness_centrality) & set(eigenvector_centrality) & set(closeness_centrality)

graph_directory = os.fsencode("../.media/graphs/")
top_nodes = list(all_centrality_nodes)[:6]

with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
for entry in it:
if entry.name.endswith(".gml") and entry.is_file():
print("-----------------------")
print(f"Filename: {entry.name}")
graph = nx.read_gml(entry.path)
graph_info = analyze_graph_structure(graph)
print_graph_info(graph_info)
return top_nodes
Loading

0 comments on commit d1e3481

Please sign in to comment.