Skip to content

Commit

Permalink
Merge branch 'develop' into feat/eliminate-duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
nikolas-rauscher authored Jul 3, 2024
2 parents 68033f1 + 3fdd460 commit d1e3481
Show file tree
Hide file tree
Showing 47 changed files with 3,083 additions and 1,087 deletions.
41 changes: 41 additions & 0 deletions Documentation/llmExtractionMeasurments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Measurements of running the graph creator
# india health article

llama3, chunk_size=1500, chunk_overlap=150, 30 calls/min, 60 sec wait

gemini-pro, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait

gemini-flash, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait

# Measurements as averages over all llm calls
# two kinds of prompts used: extract_entities_and_relations and check_for_connecting_relation


# Execution speed of prompts by llm model

gemini-flash: 12,75s (10 extraction requests) per request extracting, 1.57s connecting

gemini: 23,54s (10 extraction requests) per request extracting, 2,37s connecting

groq+llama3: 0.72s (10 extraction requests) per request extracting, 0,48s connecting

---------------------------------------

# Statistics on the number of extracted entities by llm model

llama: 3078 tokens / 1770 words -> 177 / 180 entities (34 / 47 connecting requests)

gemini: 3078 tokens / 1770 words -> 303 / 316 entities (35 connecting requests)

gemini-flash: 3078 tokens / 1770 words -> 309 / 369 entities (28 connecting requests)


-----------------------------------------

# Duration of knowledge graph extraction by llm model

gemini-flash: 127,5s for entity extraction and 105s for connecting

gemini: 212s for entity extraction and 189s for connecting

groq+llama3: 7,9s for entity extraction and 136s for connecting
152 changes: 44 additions & 108 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@
import os
import json

def get_top_n_central_nodes(centrality_dict, n):
    """Return the identifiers of the top-N nodes ranked by centrality.

    Args:
        centrality_dict: Mapping of node identifier -> centrality value,
            as produced by the networkx centrality functions
            (e.g. ``nx.degree_centrality``).
        n: Number of top nodes to return. If ``n`` exceeds the number of
            nodes, every node is returned (sorted); if ``n <= 0`` the
            result is empty.

    Returns:
        List of at most ``n`` node identifiers in descending order of
        centrality. Note: only the node names are returned, not the
        centrality values.
    """
    # Sort by centrality value (item[1]) descending; ties keep the
    # dict's insertion order because Python's sort is stable.
    ranked = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
    return [node for node, _ in ranked[:n]]

def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Expand All @@ -17,15 +31,16 @@ def analyze_graph_structure(G):
# Basic Graph Statistics
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges
density = nx.density(G) # Ratio of actual edges to possible edges (0 to 1)
average_degree = 2 * num_edges / num_nodes # Average number of edges per node

# Degree Distribution
# Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes

degree_centrality = nx.degree_centrality(G)
if num_nodes == 0 or num_edges == 0:
raise ValueError("The graph is empty or not properly constructed.")

# Degree Centrality: Measures node connectivity
degree_centrality = nx.degree_centrality(G)
""" Centrality Measures
- Degree Centrality: Measures node connectivity
- Nodes with high degree centrality are important in the network
Expand All @@ -36,9 +51,10 @@ def analyze_graph_structure(G):
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
"""

betweenness_centrality = nx.betweenness_centrality(G)

"""
# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
- Betweenness Centrality: Measures node's control over information flow
- Nodes with high betweenness centrality are important in the network
Expand All @@ -54,24 +70,8 @@ def analyze_graph_structure(G):
- Betweenness Centrality show the dependency of the network on a node
"""

# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
Examples: 4 nodes are connected
0
/ | \
2--1--3
- Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
- Closeness Centrality show the average distance of a node to all other nodes in the network
"""

# - Eigenvector Centrality: Measures influence of a node in a network

# eigenvector centrality measures the influence of a node in a network
eigenvector_centrality = nx.eigenvector_centrality(G)

"""
Expand All @@ -92,95 +92,31 @@ def analyze_graph_structure(G):
"""

# Community Structure
# - Louvain Algorithm (for community detection)
communities = list(nx.community.greedy_modularity_communities(G))
community_sizes = [len(community) for community in communities]
num_communities = len(communities)
# Communities can reveal modular structures in the graph
"""
- Community Detection: Identifying groups of nodes that are more connected to each other than to the rest of the network
- Communities can reveal modular structures in the graph
- Communities can be used to identify groups of nodes that are more connected to each other than to the rest of the network
Examples: 7 nodes are connected
1
/ \
2-----3
\ / 5
4-----/ \
6-----7
# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

- Here, nodes 1, 2, 3, 4 are in one community and nodes 5, 6, 7 are in another community
"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
# Graph Connectivity
# - Check if the graph is connected
is_connected = nx.is_connected(G)
# - Calculate diameter: Longest shortest path between any two nodes
diameter = nx.diameter(G) if is_connected else float('inf')
# - Average shortest path length: Average of all shortest paths in the graph
average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')

# Clustering Coefficient
# - Measures the degree to which nodes tend to cluster together
average_clustering_coefficient = nx.average_clustering(G)

# Assortativity
# - Measures the similarity of connections in the graph with respect to node degree
assortativity = nx.degree_assortativity_coefficient(G)

# Graph Diameter and Radius
# - Diameter: Longest shortest path in the graph
# - Radius: Minimum eccentricity of any node
radius = nx.radius(G) if is_connected else float('inf')

# Graph Transitivity
# - Measures the overall probability for the network to have adjacent nodes interconnected
transitivity = nx.transitivity(G)

# Return a dictionary containing the structural information
graph_info = {
"num_nodes": num_nodes,
"num_edges": num_edges,
"density": density,
"average_degree": average_degree,
"degree_distribution": degree_distribution,
"degree_centrality": degree_centrality,
"betweenness_centrality": betweenness_centrality,
"closeness_centrality": closeness_centrality,
"eigenvector_centrality": eigenvector_centrality,
"num_communities": num_communities,
"community_sizes": community_sizes,
"is_connected": is_connected,
"diameter": diameter,
"average_shortest_path_length": average_shortest_path_length,
"average_clustering_coefficient": average_clustering_coefficient,
"assortativity": assortativity,
"radius": radius,
"transitivity": transitivity
}

return graph_info


def print_graph_info(graph_info):
"""Prints the graph information in a formatted and readable way.
Examples: 4 nodes are connected
0
/ | \
2--1--3
Args:
graph_info: A dictionary containing information about the graph's structure.
- Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
- Closeness Centrality show the average distance of a node to all other nodes in the network
"""
n = 20 # Number of top nodes to return
# Calculate centrality measures
degree_centrality = get_top_n_central_nodes(nx.degree_centrality(G), n)
betweenness_centrality = get_top_n_central_nodes(nx.betweenness_centrality(G), n)
eigenvector_centrality = get_top_n_central_nodes(nx.eigenvector_centrality(G), n)
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)

print(json.dumps(graph_info, indent=4))

# Find intersection of top nodes from all measures (set intersection)
all_centrality_nodes = set(degree_centrality) & set(betweenness_centrality) & set(eigenvector_centrality) & set(closeness_centrality)

graph_directory = os.fsencode("../.media/graphs/")
top_nodes = list(all_centrality_nodes)[:6]

with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
for entry in it:
if entry.name.endswith(".gml") and entry.is_file():
print("-----------------------")
print(f"Filename: {entry.name}")
graph = nx.read_gml(entry.path)
graph_info = analyze_graph_structure(graph)
print_graph_info(graph_info)
return top_nodes
Loading

0 comments on commit d1e3481

Please sign in to comment.