Refactor/run linting and fix any errors #158

Merged: 8 commits, Jun 25, 2024
15 changes: 9 additions & 6 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
@@ -1,6 +1,7 @@
import networkx as nx
import os
import json
import os

import networkx as nx


def analyze_graph_structure(G):
@@ -118,9 +119,11 @@ def analyze_graph_structure(G):
# - Check if the graph is connected
is_connected = nx.is_connected(G)
# - Calculate diameter: Longest shortest path between any two nodes
diameter = nx.diameter(G) if is_connected else float('inf')
diameter = nx.diameter(G) if is_connected else float("inf")
# - Average shortest path length: Average of all shortest paths in the graph
average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')
average_shortest_path_length = (
nx.average_shortest_path_length(G) if is_connected else float("inf")
)

# Clustering Coefficient
# - Measures the degree to which nodes tend to cluster together
@@ -133,7 +136,7 @@ def analyze_graph_structure(G):
# Graph Diameter and Radius
# - Diameter: Longest shortest path in the graph
# - Radius: Minimum eccentricity of any node
radius = nx.radius(G) if is_connected else float('inf')
radius = nx.radius(G) if is_connected else float("inf")

# Graph Transitivity
# - Measures the overall probability for the network to have adjacent nodes interconnected
@@ -158,7 +161,7 @@ def analyze_graph_structure(G):
"average_clustering_coefficient": average_clustering_coefficient,
"assortativity": assortativity,
"radius": radius,
"transitivity": transitivity
"transitivity": transitivity,
}

return graph_info
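
A note on the guarded metrics above: nx.diameter, nx.radius, and nx.average_shortest_path_length raise an exception when the graph is not connected, which is why each call is gated on is_connected. A minimal, self-contained sketch of the pattern (the two-component graph is a made-up example, not taken from the PR):

import networkx as nx

# Two separate edges form two components, so the graph is not connected.
G = nx.Graph()
G.add_edges_from([("a", "b"), ("c", "d")])

is_connected = nx.is_connected(G)  # False
diameter = nx.diameter(G) if is_connected else float("inf")
radius = nx.radius(G) if is_connected else float("inf")
average_shortest_path_length = (
    nx.average_shortest_path_length(G) if is_connected else float("inf")
)
print(diameter, radius, average_shortest_path_length)  # inf inf inf
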
17 changes: 11 additions & 6 deletions Project/backend/codebase/graph_creator/gemini.py
@@ -1,6 +1,8 @@
import os
from datetime import datetime

import google.generativeai as genai

from graph_creator.services.json_handler import transform_llm_output_to_dict


@@ -69,16 +71,19 @@ def extract_entities_and_relations(chunk, genai_client):


def check_for_connecting_relation(
chunk, entities_component_1, entities_component_2, genai_client
chunk, entities_component_1, entities_component_2, genai_client
):
"""
Check for connecting relation between entities of two components.
"""
SYS_PROMPT = (
"Only answer in JSON format. \n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
"with any entity of list_2.\n "
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
"delimited by ```). "
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
"list_2:\n "
f"list_1: {entities_component_1}\n"
f"list_2: {entities_component_2}\n"
"Only use the exact entities given in the lists."
@@ -99,7 +104,7 @@ def check_for_connecting_relation(


def check_for_connecting_relation_(
text_chunk, entities_component_1, entities_component_2
text_chunk, entities_component_1, entities_component_2
):
"""
Takes a text chunk, and two lists of entities (from each component in the graph)
@@ -112,7 +117,7 @@ def check_for_connecting_relation_(
The text chunk to be proccessed
entities_component_1 : list
List of entities
entities_component_1 : list
entities_component_2 : list
List of entities

Returns
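
The reflowed SYS_PROMPT lines rely on Python concatenating adjacent string literals inside parentheses, so splitting a long line across several literals builds the same prompt string. A short check with a shortened, stand-in prompt (not the full prompt from the file):

# Adjacent string literals are joined at compile time into one string.
PROMPT_SINGLE = "Only answer in JSON format. \nOnly use the exact entities given in the lists."
PROMPT_SPLIT = (
    "Only answer in JSON format. \n"
    "Only use the exact entities given in the lists."
)
assert PROMPT_SINGLE == PROMPT_SPLIT
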
12 changes: 7 additions & 5 deletions Project/backend/codebase/graph_creator/graph_creator_main.py
@@ -1,11 +1,15 @@
import logging
import mimetypes

from graph_creator import graph_handler
from graph_creator import pdf_handler
from graph_creator.llama3 import process_chunks as groq_process_chunks
from graph_creator.models.graph_job import GraphJob
from graph_creator import pdf_handler
from graph_creator import graph_handler
from graph_creator.services import netx_graphdb

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def process_file_to_graph(g_job: GraphJob):
"""
@@ -57,11 +61,9 @@ def process_file_to_entities_and_relations(file: str):
] # Assuming chunk has 'page_content' attribute

# Generate response using LLM
# response_json = process_chunks(text_chunks, prompt_template)
response_json = groq_process_chunks(text_chunks)
print(response_json)
except Exception as e:
print(e)
logging.error(e)
response_json = None

return response_json, chunks
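
This module now configures logging at import time and reports failures through logging.error instead of print. A minimal sketch of that pattern, with a hypothetical processing step standing in for the real one:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def process_sketch(path):
    # Hypothetical stand-in for process_file_to_entities_and_relations.
    try:
        if not path.endswith(".pdf"):
            raise ValueError(f"unsupported file: {path}")
        return {"chunks": []}
    except Exception as e:
        # Mirrors the PR: log the exception rather than printing it.
        logging.error(e)
        return None


process_sketch("notes.txt")
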
33 changes: 20 additions & 13 deletions Project/backend/codebase/graph_creator/graph_handler.py
@@ -1,17 +1,23 @@
import pandas as pd
import re
import json
import logging
import re
import time

import pandas as pd

from graph_creator import llama3

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def build_flattened_dataframe(entities_and_relations):
"""
Flatten list of lists by adding chunk_id attribute convert to pandas dataframe

Parameters
----------
entity_and_relations : list
entities_and_relations : list
List of Lists of dictionaries

Returns
@@ -47,7 +53,7 @@ def connect_with_chunk_proximity(entity_and_relation_df):
pandas.dataframe
A table with given relations and chunk proximity relations between the nodes
"""
# seperate all nodes by chunk_id
# separate all nodes by chunk_id
df_by_chunk_id = pd.melt(
entity_and_relation_df,
id_vars=["chunk_id"],
@@ -116,7 +122,7 @@ def index_entity_relation_table(entity_and_relation_df, entities):
A List containing all relations as tuples of entity indexes
"""
entities_dict = {}
# for reproducable results
# for reproducible results
entities = sorted(entities)
for i in range(len(entities)):
entities_dict[entities[i]] = i
@@ -178,7 +184,7 @@ def extract_components(relations_list):
elif inserte["at"] >= 0:
components[inserte["at"]].append(inserte["new_node"])

# remove empty componente
# remove empty components
components.pop(len(components) - 1)

return components
@@ -242,7 +248,6 @@ def get_shared_chunks_by_component(component1, component2, entity_chunks_list):
chunk_entities = set(entity_chunks_list[keys[i]])
intersection_c1 = chunk_entities.intersection(entities_component_1)
intersection_c2 = chunk_entities.intersection(entities_component_2)
# print(f"{intersection_size_c1}, {intersection_size_c2}")
if len(intersection_c1) > 0 and len(intersection_c2) > 0:
shared_chunks.append(keys[i])
intersections[keys[i]] = {"c1": intersection_c1, "c2": intersection_c2}
@@ -344,6 +349,9 @@ def connect_with_llm(data, text_chunks, rate_limit):
Table of nodes and relations between the nodes
text_chunks : list
A list of dictionaries containing the text chunks
rate_limit : int
The maximum number of requests that can be made to the LLM within a specified
timeframe.

Returns
-------
@@ -356,7 +364,7 @@ def connect_with_llm(data, text_chunks, rate_limit):
components = extract_components(relations_list)
number_components = len(components)

print("Before connecting {} components".format(number_components))
logger.info(f"Before connecting {number_components} components")

# get chunk information about contained entities
entity_chunks_list = get_entities_by_chunk(data, entities_dict)
@@ -408,18 +416,17 @@ def connect_with_llm(data, text_chunks, rate_limit):
relation = extract_relation_from_llm_output(
connecting_relation, main_chunk_entities, current_chunk_entities
)

# if relation is extracted than a valid relation containing only existing entities can be added
# print(relation)
if relation is not None:
relation["chunk_id"] = key_shared_chunk
connecting_relations.append(relation)
connections += 1
break

print(
"Made {} new connections and thereby reduced the graph to {} components".format(
connections, number_components - connections
)
logger.info(
f"Made {connections} new connections and thereby reduced the graph "
f"to {number_components - connections} components "
)
data = add_relations_to_data(data, connecting_relations)

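
The progress messages in connect_with_llm move from print with str.format to the module logger with f-strings. A small sketch of the resulting calls; the counts are illustrative values, not output from a real run:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

number_components = 7  # illustrative
connections = 3        # illustrative

logger.info(f"Before connecting {number_components} components")
logger.info(
    f"Made {connections} new connections and thereby reduced the graph "
    f"to {number_components - connections} components "
)
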
3 changes: 2 additions & 1 deletion Project/backend/codebase/graph_creator/json_to_graphml.py
@@ -1,7 +1,8 @@
import json
import logging

import networkx as nx
import pandas as pd
import logging


def json_string_to_graph(json_string):
18 changes: 11 additions & 7 deletions Project/backend/codebase/graph_creator/llama3.py
@@ -1,5 +1,6 @@
import os
from datetime import datetime

from groq import Groq

from graph_creator.services.json_handler import transform_llm_output_to_dict
@@ -9,7 +10,7 @@ def configure_groq():
"""
Ensure the API key is set in the environment
"""
# load_dotenv("Project/backend/.env", override=True)

api_key = os.getenv("GROQ_API_KEY")
if not api_key:
raise ValueError("API key not found in environment variables")
@@ -71,16 +72,19 @@ def extract_entities_and_relations(chunk, groq_client):


def check_for_connecting_relation(
chunk, entities_component_1, entities_component_2, groq_client
chunk, entities_component_1, entities_component_2, groq_client
):
"""
Check for connecting relation between entities of two components.
"""
SYS_PROMPT = (
"Only answer in JSON format. \n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
"with any entity of list_2.\n "
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
"delimited by ```). "
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
"list_2:\n "
f"list_1: {entities_component_1}\n"
f"list_2: {entities_component_2}\n"
"Only use the exact entities given in the lists."
@@ -103,7 +107,7 @@ def check_for_connecting_relation(


def check_for_connecting_relation_(
text_chunk, entities_component_1, entities_component_2
text_chunk, entities_component_1, entities_component_2
):
"""
Takes a text chunk, and two lists of entities (from each component in the graph)
@@ -116,7 +120,7 @@ def check_for_connecting_relation_(
The text chunk to be proccessed
entities_component_1 : list
List of entities
entities_component_1 : list
entities_component_2 : list
List of entities

Returns
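
configure_groq reads the API key from the environment and fails fast when it is missing. A minimal sketch of that guard on its own (client construction omitted):

import os


def configure_sketch():
    # Same check as configure_groq: refuse to continue without a key.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise ValueError("API key not found in environment variables")
    return api_key
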
5 changes: 3 additions & 2 deletions Project/backend/codebase/graph_creator/pdf_handler.py
@@ -1,6 +1,7 @@
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def process_pdf_into_chunks(filename):
@@ -10,7 +11,7 @@ def process_pdf_into_chunks(filename):
Parameters
----------
filename : str
The name of the pdf file to be proccessed
The name of the pdf file to be processed

Returns
-------
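
The reordered imports pair PyPDFLoader with RecursiveCharacterTextSplitter. A sketch of how process_pdf_into_chunks typically combines them; the chunk_size and chunk_overlap values are assumptions, since the actual settings are not visible in this diff:

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def chunks_sketch(filename):
    # Load the PDF into per-page documents, then split into overlapping chunks.
    loader = PyPDFLoader(filename)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)  # assumed values
    return splitter.split_documents(docs)
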