
Commit 850aaf3

Merge pull request #158 from amosproj/refactor/run-linting-and-fix-any-errors
Refactor/run linting and fix any errors
2 parents 584a69f + 1d8856a · commit 850aaf3

File tree

20 files changed: +919 -305 lines changed

Project/backend/codebase/graph_analysis/graph_analysis.py

+9 -6
```diff
@@ -1,6 +1,7 @@
-import networkx as nx
-import os
 import json
+import os
+
+import networkx as nx
 
 
 def analyze_graph_structure(G):
@@ -118,9 +119,11 @@ def analyze_graph_structure(G):
     # - Check if the graph is connected
     is_connected = nx.is_connected(G)
     # - Calculate diameter: Longest shortest path between any two nodes
-    diameter = nx.diameter(G) if is_connected else float('inf')
+    diameter = nx.diameter(G) if is_connected else float("inf")
     # - Average shortest path length: Average of all shortest paths in the graph
-    average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')
+    average_shortest_path_length = (
+        nx.average_shortest_path_length(G) if is_connected else float("inf")
+    )
 
     # Clustering Coefficient
     # - Measures the degree to which nodes tend to cluster together
@@ -133,7 +136,7 @@ def analyze_graph_structure(G):
     # Graph Diameter and Radius
     # - Diameter: Longest shortest path in the graph
     # - Radius: Minimum eccentricity of any node
-    radius = nx.radius(G) if is_connected else float('inf')
+    radius = nx.radius(G) if is_connected else float("inf")
 
     # Graph Transitivity
     # - Measures the overall probability for the network to have adjacent nodes interconnected
@@ -158,7 +161,7 @@ def analyze_graph_structure(G):
         "average_clustering_coefficient": average_clustering_coefficient,
         "assortativity": assortativity,
         "radius": radius,
-        "transitivity": transitivity
+        "transitivity": transitivity,
     }
 
     return graph_info
```
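A note on the `float("inf")` fallback these hunks reformat: `nx.diameter`, `nx.average_shortest_path_length`, and `nx.radius` all raise `NetworkXError` on a disconnected graph, so the `is_connected` guard is what keeps the analysis from crashing. A minimal sketch (the toy graph is mine, not from the repo):

```python
import networkx as nx

# Two separate components, so the graph is not connected.
G = nx.Graph([(1, 2), (3, 4)])

is_connected = nx.is_connected(G)  # False
# Without the guard, nx.diameter(G) would raise NetworkXError here.
diameter = nx.diameter(G) if is_connected else float("inf")
print(diameter)  # inf
```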

Project/backend/codebase/graph_creator/gemini.py

+11 -6
```diff
@@ -1,6 +1,8 @@
 import os
 from datetime import datetime
+
 import google.generativeai as genai
+
 from graph_creator.services.json_handler import transform_llm_output_to_dict
 
 
@@ -69,16 +71,19 @@ def extract_entities_and_relations(chunk, genai_client):
 
 
 def check_for_connecting_relation(
-    chunk, entities_component_1, entities_component_2, genai_client
+        chunk, entities_component_1, entities_component_2, genai_client
 ):
     """
     Check for connecting relation between entities of two components.
     """
     SYS_PROMPT = (
         "Only answer in JSON format. \n"
-        "Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
-        "We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
-        "For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
+        "Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
+        "with any entity of list_2.\n "
+        "We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
+        "delimited by ```). "
+        "For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
+        "list_2:\n "
         f"list_1: {entities_component_1}\n"
         f"list_2: {entities_component_2}\n"
         "Only use the exact entities given in the lists."
@@ -99,7 +104,7 @@ def check_for_connecting_relation(
 
 
 def check_for_connecting_relation_(
-    text_chunk, entities_component_1, entities_component_2
+        text_chunk, entities_component_1, entities_component_2
 ):
     """
     Takes a text chunk, and two lists of entities (from each component in the graph)
@@ -112,7 +117,7 @@ def check_for_connecting_relation_(
         The text chunk to be proccessed
     entities_component_1 : list
         List of entities
-    entities_component_1 : list
+    entities_component_2 : list
         List of entities
 
     Returns
```
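One reviewer-style observation on the reflowed `SYS_PROMPT`: adjacent string literals inside parentheses are concatenated at compile time, so splitting the long lines still yields a single prompt string. The wrapped version is not byte-identical, though; segments like `"with any entity of list_2.\n "` carry a trailing space, so the reformat inserts spaces after some newlines, which an LLM prompt normally tolerates. A minimal sketch of the mechanism (example strings are illustrative):

```python
# Adjacent literals fuse into one string object at compile time.
prompt = (
    "Only answer in JSON format. \n"
    "Your task is to help create a knowledge graph.\n "
)
assert prompt == "Only answer in JSON format. \nYour task is to help create a knowledge graph.\n "
```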

Project/backend/codebase/graph_creator/graph_creator_main.py

+7 -5
```diff
@@ -1,11 +1,15 @@
+import logging
 import mimetypes
 
+from graph_creator import graph_handler
+from graph_creator import pdf_handler
 from graph_creator.llama3 import process_chunks as groq_process_chunks
 from graph_creator.models.graph_job import GraphJob
-from graph_creator import pdf_handler
-from graph_creator import graph_handler
 from graph_creator.services import netx_graphdb
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 
 def process_file_to_graph(g_job: GraphJob):
     """
@@ -57,11 +61,9 @@ def process_file_to_entities_and_relations(file: str):
         ]  # Assuming chunk has 'page_content' attribute
 
         # Generate response using LLM
-        # response_json = process_chunks(text_chunks, prompt_template)
         response_json = groq_process_chunks(text_chunks)
-        print(response_json)
     except Exception as e:
-        print(e)
+        logging.error(e)
         response_json = None
 
     return response_json, chunks
```
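The print-to-logging switch here configures the root logger at import time and defines a module-level `logger`, but the `except` branch then logs through `logging.error` (the root logger) rather than the `logger` it just created; both work, it is just an inconsistency worth noting. A minimal sketch of the two call styles (message text is mine):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("chunking finished")     # module logger, as defined in this diff
logging.error("LLM request failed")  # root logger, as used in the except block
```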

Project/backend/codebase/graph_creator/graph_handler.py

+20 -13
```diff
@@ -1,17 +1,23 @@
-import pandas as pd
-import re
 import json
+import logging
+import re
 import time
+
+import pandas as pd
+
 from graph_creator import llama3
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 
 def build_flattened_dataframe(entities_and_relations):
     """
     Flatten list of lists by adding chunk_id attribute convert to pandas dataframe
 
     Parameters
     ----------
-    entity_and_relations : list
+    entities_and_relations : list
         List of Lists of dictionaries
 
     Returns
@@ -47,7 +53,7 @@ def connect_with_chunk_proximity(entity_and_relation_df):
     pandas.dataframe
         A table with given relations and chunk proximity relations between the nodes
     """
-    # seperate all nodes by chunk_id
+    # separate all nodes by chunk_id
     df_by_chunk_id = pd.melt(
         entity_and_relation_df,
         id_vars=["chunk_id"],
@@ -116,7 +122,7 @@ def index_entity_relation_table(entity_and_relation_df, entities):
         A List containing all relations as tuples of entity indexes
     """
     entities_dict = {}
-    # for reproducable results
+    # for reproducible results
     entities = sorted(entities)
     for i in range(len(entities)):
         entities_dict[entities[i]] = i
@@ -178,7 +184,7 @@ def extract_components(relations_list):
         elif inserte["at"] >= 0:
             components[inserte["at"]].append(inserte["new_node"])
 
-    # remove empty componente
+    # remove empty components
     components.pop(len(components) - 1)
 
     return components
@@ -242,7 +248,6 @@ def get_shared_chunks_by_component(component1, component2, entity_chunks_list):
         chunk_entities = set(entity_chunks_list[keys[i]])
         intersection_c1 = chunk_entities.intersection(entities_component_1)
         intersection_c2 = chunk_entities.intersection(entities_component_2)
-        # print(f"{intersection_size_c1}, {intersection_size_c2}")
         if len(intersection_c1) > 0 and len(intersection_c2) > 0:
             shared_chunks.append(keys[i])
             intersections[keys[i]] = {"c1": intersection_c1, "c2": intersection_c2}
@@ -344,6 +349,9 @@ def connect_with_llm(data, text_chunks, rate_limit):
         Table of nodes and relations between the nodes
     text_chunks : list
         A list of dictionaries containing the text chunks
+    rate_limit : int
+        The maximum number of requests that can be made to the LLM within a specified
+        timeframe.
 
     Returns
     -------
@@ -356,7 +364,7 @@ def connect_with_llm(data, text_chunks, rate_limit):
     components = extract_components(relations_list)
     number_components = len(components)
 
-    print("Before connecting {} components".format(number_components))
+    logger.info(f"Before connecting {number_components} components")
 
     # get chunk information about contained entities
     entity_chunks_list = get_entities_by_chunk(data, entities_dict)
@@ -408,18 +416,17 @@ def connect_with_llm(data, text_chunks, rate_limit):
                 relation = extract_relation_from_llm_output(
                     connecting_relation, main_chunk_entities, current_chunk_entities
                 )
+
                 # if relation is extracted than a valid relation containing only existing entities can be added
-                # print(relation)
                 if relation is not None:
                     relation["chunk_id"] = key_shared_chunk
                     connecting_relations.append(relation)
                     connections += 1
                     break
 
-    print(
-        "Made {} new connections and thereby reduced the graph to {} components".format(
-            connections, number_components - connections
-        )
+    logger.info(
+        f"Made {connections} new connections and thereby reduced the graph "
+        f"to {number_components - connections} components "
     )
     data = add_relations_to_data(data, connecting_relations)
 
```
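The replaced `print`/`str.format` calls become eagerly formatted f-strings passed to `logger.info`. That is fine at INFO level; the stdlib also supports lazy `%`-style arguments, which defer formatting until a handler actually emits the record. A short sketch of both styles (the value is hypothetical):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

number_components = 7  # hypothetical value

# As in the diff: the f-string is built before logger.info is called.
logger.info(f"Before connecting {number_components} components")

# Lazy stdlib alternative: formatted only if INFO is enabled.
logger.info("Before connecting %d components", number_components)
```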

Project/backend/codebase/graph_creator/json_to_graphml.py

+2 -1
```diff
@@ -1,7 +1,8 @@
 import json
+import logging
+
 import networkx as nx
 import pandas as pd
-import logging
 
 
 def json_string_to_graph(json_string):
```

Project/backend/codebase/graph_creator/llama3.py

+11 -7
```diff
@@ -1,5 +1,6 @@
 import os
 from datetime import datetime
+
 from groq import Groq
 
 from graph_creator.services.json_handler import transform_llm_output_to_dict
@@ -9,7 +10,7 @@ def configure_groq():
     """
     Ensure the API key is set in the environment
     """
-    # load_dotenv("Project/backend/.env", override=True)
+
     api_key = os.getenv("GROQ_API_KEY")
     if not api_key:
         raise ValueError("API key not found in environment variables")
@@ -71,16 +72,19 @@ def extract_entities_and_relations(chunk, groq_client):
 
 
 def check_for_connecting_relation(
-    chunk, entities_component_1, entities_component_2, groq_client
+        chunk, entities_component_1, entities_component_2, groq_client
 ):
     """
     Check for connecting relation between entities of two components.
     """
     SYS_PROMPT = (
         "Only answer in JSON format. \n"
-        "Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
-        "We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
-        "For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
+        "Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
+        "with any entity of list_2.\n "
+        "We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
+        "delimited by ```). "
+        "For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
+        "list_2:\n "
         f"list_1: {entities_component_1}\n"
         f"list_2: {entities_component_2}\n"
         "Only use the exact entities given in the lists."
@@ -103,7 +107,7 @@ def check_for_connecting_relation(
 
 
 def check_for_connecting_relation_(
-    text_chunk, entities_component_1, entities_component_2
+        text_chunk, entities_component_1, entities_component_2
 ):
     """
     Takes a text chunk, and two lists of entities (from each component in the graph)
@@ -116,7 +120,7 @@ def check_for_connecting_relation_(
         The text chunk to be proccessed
     entities_component_1 : list
         List of entities
-    entities_component_1 : list
+    entities_component_2 : list
         List of entities
 
     Returns
```
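With the commented-out `load_dotenv(...)` line gone, `configure_groq` relies on `GROQ_API_KEY` already being present in the process environment, however the deployment injects it; the fail-fast check itself is unchanged. A minimal sketch of that check in isolation:

```python
import os

api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    # Fail fast instead of letting the Groq client error later.
    raise ValueError("API key not found in environment variables")
```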

Project/backend/codebase/graph_creator/pdf_handler.py

+3 -2
```diff
@@ -1,6 +1,7 @@
 import os
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 
 def process_pdf_into_chunks(filename):
@@ -10,7 +11,7 @@ def process_pdf_into_chunks(filename):
     Parameters
     ----------
     filename : str
-        The name of the pdf file to be proccessed
+        The name of the pdf file to be processed
 
     Returns
     -------
```
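For context on the two reordered imports, a typical load-then-split pipeline with these classes looks like the sketch below. The file name and chunk parameters are assumptions for illustration, not values taken from this module:

```python
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = PyPDFLoader("example.pdf")  # hypothetical input file
pages = loader.load()  # one Document per PDF page

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(pages)

# Matches the attribute the diff's comment assumes on each chunk.
text_chunks = [chunk.page_content for chunk in chunks]
```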
