Merge pull request #210 from amosproj/develop
Develop
get4flo authored Jul 17, 2024
2 parents 9b35018 + 3247592 commit d6083c3
Showing 35 changed files with 1,034 additions and 1,892 deletions.
2 changes: 1 addition & 1 deletion Project/backend/codebase/graph_analysis/graph_analysis.py
@@ -108,7 +108,7 @@ def analyze_graph_structure(G):
    - Here, nodes 0 and 1 (1.0) have the highest closeness centrality because they are connected to all other nodes (nodes 2, 3 = 0.75)
    - Closeness centrality shows the average distance of a node to all other nodes in the network
    """
    n = 20  # Number of top nodes to return
    n = 20 if num_nodes > 20 else 5  # Number of top nodes to return
    # Calculate centrality measures
    degree_centrality = get_top_n_central_nodes(nx.degree_centrality(G), n)
    betweenness_centrality = get_top_n_central_nodes(nx.betweenness_centrality(G), n)
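
For context, get_top_n_central_nodes is not part of this diff; a minimal sketch of what such a helper might look like, assuming it simply ranks a centrality dict and keeps the n highest-scoring nodes (the body is an assumption, not the repository's implementation):

# Hypothetical sketch of get_top_n_central_nodes, inferred from the call sites above.
def get_top_n_central_nodes(centrality: dict, n: int) -> dict:
    # Sort nodes by centrality score, highest first, and keep the top n entries
    ranked = sorted(centrality.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:n])
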
479 changes: 198 additions & 281 deletions Project/backend/codebase/graph_creator/embedding_handler.py

Large diffs are not rendered by default.
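Since the rewritten embedding_handler.py is not rendered, here is a rough sketch of its public surface as reconstructed from the call sites in this PR (graph_creator_main.py and router.py below); the constructor arguments and method names come from those call sites, while the bodies are placeholders rather than the actual implementation:

# Interface sketch of embeddings_handler, pieced together from its usage in this PR.
# Bodies are placeholders; the real implementation (embedding, duplicate merging,
# vector search, persistence) lives in the un-rendered diff above.
from graph_creator.models.graph_job import GraphJob


class embeddings_handler:
    def __init__(self, graph_job: GraphJob, lazyLoad: bool = False):
        # lazyLoad=True skips loading models/indexes, e.g. when only deleting embeddings
        self.graph_job = graph_job
        self.lazy_load = lazyLoad

    def generate_embeddings_and_merge_duplicates(self, combined):
        """Embed the combined entity/relation DataFrame and merge near-duplicate nodes."""
        raise NotImplementedError

    def is_embedded(self) -> bool:
        """Return True if embeddings already exist for this graph job."""
        raise NotImplementedError

    def search_graph(self, query: str, k: int = 4):
        """Return the k graph elements most similar to the query."""
        raise NotImplementedError

    def delete_embeddings(self) -> None:
        """Remove any stored embeddings for this graph job."""
        raise NotImplementedError
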

9 changes: 8 additions & 1 deletion Project/backend/codebase/graph_creator/graph_creator_main.py
@@ -1,6 +1,7 @@
import logging

from graph_creator.embedding_handler import embeddings_handler
from graph_creator import graph_handler
import os
from graph_creator.services.llm.llama_gemini_combination import llama_gemini_combination
from graph_creator.models.graph_job import GraphJob
from graph_creator.services import netx_graphdb
@@ -88,6 +89,12 @@ def create_and_store_graph(uuid, entities_and_relations, chunks, llm_handler):
        chunks[i] = chunks[i].dict()
    combined = graph_handler.connect_with_llm(df_e_and_r, chunks, llm_handler)

    # Create an instance of the embeddings handler
    embeddings_handler_instance = embeddings_handler(GraphJob(id=uuid))

    # Generate embeddings and merge duplicates
    combined = embeddings_handler_instance.generate_embeddings_and_merge_duplicates(combined)

    # get graph db service
    graph_db_service = netx_graphdb.NetXGraphDB()

27 changes: 16 additions & 11 deletions Project/backend/codebase/graph_creator/graph_handler.py
@@ -134,7 +134,8 @@ def index_entity_relation_table(entity_and_relation_df, entities):
        entities_dict[entities[i]] = i

    relations = []
    for i, row in entity_and_relation_df.iterrows():
    entity_and_relation_df_withoutna = entity_and_relation_df.dropna()
    for i, row in entity_and_relation_df_withoutna.iterrows():
        relations.append([entities_dict[row["node_1"]], entities_dict[row["node_2"]]])

    return entities_dict, relations
@@ -213,7 +214,8 @@ def get_entities_by_chunk(entity_and_relation_df, entities_dict):
        A dictionary containing all entities per chunk as ids
    """
    entities_by_chunk = {}
    for i, row in entity_and_relation_df.iterrows():
    entity_and_relation_df_withoutna = entity_and_relation_df.dropna()
    for i, row in entity_and_relation_df_withoutna.iterrows():
        if row["chunk_id"] in entities_by_chunk:
            entities_by_chunk[row["chunk_id"]].append(entities_dict[row["node_1"]])
            entities_by_chunk[row["chunk_id"]].append(entities_dict[row["node_2"]])
@@ -333,15 +335,18 @@ def add_relations_to_data(entity_and_relation_df, new_relations):
"""
for relation in new_relations:
node_1 = relation["node_1"]
node_2 = relation["node_2"]
edge = relation["edge"]
chunk_id = relation["chunk_id"]

pos = len(entity_and_relation_df.index)
entity_and_relation_df.loc[pos] = [node_1, node_2, edge, chunk_id]

return entity_and_relation_df
try:
node_1 = relation["node_1"]
node_2 = relation["node_2"]
edge = relation["edge"]
chunk_id = relation["chunk_id"]

pos = len(entity_and_relation_df.index)
entity_and_relation_df.loc[pos] = [node_1, node_2, edge, chunk_id]
except ValueError:
print(f"Error in add_relations_to_data: ,", node_1, node_2, edge, chunk_id)
pass
return entity_and_relation_df.dropna()


def add_topic(data: pd.DataFrame, max_topics: int = 25) -> pd.DataFrame:
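
The recurring change in graph_handler.py is dropping rows with missing values before they are iterated or written back. A small self-contained illustration (with made-up data) of why that matters: a NaN in node_1 or node_2 has no entry in entities_dict, so iterating without dropna() would raise a KeyError.

# Standalone illustration (made-up data) of the dropna() guard added above.
import pandas as pd

df = pd.DataFrame(
    {
        "node_1": ["Alice", None, "Bob"],
        "node_2": ["Bob", "Carol", "Alice"],
        "edge": ["knows", "knows", "works_with"],
        "chunk_id": ["c1", "c1", "c2"],
    }
)
entities_dict = {"Alice": 0, "Bob": 1, "Carol": 2}

relations = []
for _, row in df.dropna().iterrows():  # the incomplete row is skipped entirely
    relations.append([entities_dict[row["node_1"]], entities_dict[row["node_2"]]])

print(relations)  # [[0, 1], [1, 0]]
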
57 changes: 57 additions & 0 deletions Project/backend/codebase/graph_creator/router.py
@@ -1,3 +1,4 @@
import json
import logging
import os
import uuid
@@ -7,6 +8,8 @@
from fastapi import UploadFile, File, HTTPException
from starlette.responses import JSONResponse

from graph_creator.embedding_handler import embeddings_handler
from graph_creator.schemas.graph_query import QueryRequest
import graph_creator.graph_creator_main as graph_creator_main
from graph_creator.dao.graph_job_dao import GraphJobDAO
from graph_creator.schemas.graph_job import GraphJobCreate
@@ -193,6 +196,9 @@ async def delete_graph_job(
    graph_job_id = graph_job.id
    await graph_job_dao.delete_graph_job(graph_job)
    netx_services.delete_graph(graph_job_id)
    graphEmbeddingsHandler = embeddings_handler(graph_job, lazyLoad=True)
    graphEmbeddingsHandler.delete_embeddings()



@router.post("/create_graph/{graph_job_id}")
@@ -298,3 +304,54 @@ async def query_graph(
    graph = netx_services.load_graph(graph_job_id=graph_job_id)
    graph_keywords = analyze_graph_structure(graph)
    return graph_keywords


@router.post("/graph_search/{graph_job_id}")
async def query_graph(
    graph_job_id: uuid.UUID,
    request: QueryRequest,
    graph_job_dao: GraphJobDAO = Depends(),
):
    """
    Reads a graph job by id and tries to answer a query about the graph using embeddings

    Args:
        graph_job_id (uuid.UUID): ID of the graph job to be read.
        request (QueryRequest): contains the user query
        graph_job_dao (GraphJobDAO): graph job database access object

    Returns:
        Answer to the user's question about the graph

    Raises:
        HTTPException: If there is no graph job with the given ID.
    """

    g_job = await graph_job_dao.get_graph_job_by_id(graph_job_id)

    if not g_job:
        raise HTTPException(status_code=404, detail="Graph job not found")
    if g_job.status != GraphStatus.GRAPH_READY:
        raise HTTPException(
            status_code=400,
            detail="No graph created for this job!",
        )

    user_query = request.query

    graphEmbeddingsHandler = embeddings_handler(g_job)

    if graphEmbeddingsHandler.is_embedded():
        # search the embedded graph for the k most relevant matches
        result = graphEmbeddingsHandler.search_graph(user_query, k=4)
        answer = json.dumps(result)
    else:
        # can't answer because no embeddings exist for this graph
        answer = "No embeddings found"

    return JSONResponse(
        content={"answer": answer},
        status_code=200,
    )
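
A quick way to exercise the new endpoint once a graph job has reached GRAPH_READY; the base URL, router prefix, and job id below are placeholders and depend on how the router is mounted in your deployment:

# Hypothetical client call for the new /graph_search endpoint.
# Base URL, prefix, and graph_job_id are placeholders.
import requests

graph_job_id = "00000000-0000-0000-0000-000000000000"  # replace with a real job id
resp = requests.post(
    f"http://localhost:8000/graph_search/{graph_job_id}",
    json={"query": "Which entities are most central in this document?"},
)
print(resp.status_code)
print(resp.json()["answer"])
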
4 changes: 4 additions & 0 deletions Project/backend/codebase/graph_creator/schemas/graph_query.py
@@ -0,0 +1,4 @@
from pydantic import BaseModel

class QueryRequest(BaseModel):
    query: str
52 changes: 39 additions & 13 deletions Project/backend/codebase/graph_creator/services/file_handler.py
@@ -2,10 +2,11 @@
import os
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_core.documents import Document

from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
@@ -18,43 +19,68 @@ class FileHandler:
    def __init__(self, file_location: str):
        self.file_location = file_location
        self.file_loader = {
            ".pdf": PyPDFLoader,
            ".txt": TextLoader,
            ".docx": Docx2txtLoader,
            ".pptx": UnstructuredPowerPointLoader,
            ".json": RecursiveJsonSplitter,
            ".pdf": (PyPDFLoader, {}),
            ".txt": (TextLoader, {}),
            ".docx": (Docx2txtLoader, {}),
            ".pptx": (
                UnstructuredPowerPointLoader,
                {"mode": "elements", "strategy": "fast", "join_docs_by_page": True}
            ),
            ".json": (RecursiveJsonSplitter, {}),
        }

        if not os.path.isfile(self.file_location):
            raise ValueError("Invalid file path.")

    def process_file_into_chunks(self):
        file_loader = self._get_file_loader()
        file_loader, kwargs = self._get_file_loader()
        if file_loader == RecursiveJsonSplitter:
            return self._get_json_chunks()
        loader = file_loader(self.file_location)
        join_docs_by_page = kwargs.pop("join_docs_by_page", False)
        loader = file_loader(self.file_location, **kwargs)
        docs = loader.load()
        splits = self._process_doc_to_chunks(docs)
        splits = self._process_doc_to_chunks(docs, join_docs_by_page=join_docs_by_page)
        return splits

    @staticmethod
    def _process_doc_to_chunks(docs):
    def _process_doc_to_chunks(docs, join_docs_by_page: bool):
        if not docs:
            raise ValueError("Failed to load documents.")

        if join_docs_by_page:
            new_docs = []
            current_doc = Document(page_content="")
            current_page = None
            new_docs.append(current_doc)
            for doc in docs:
                if doc.page_content == "":
                    continue
                doc_current_page = doc.metadata.get("page_number", None)
                # start a new per-page document when the page number changes (skipping PageBreak/None elements)
                if current_page != doc_current_page and doc.metadata.get("category", None) not in ["PageBreak", None]:
                    current_doc = Document(
                        page_content=doc.page_content,
                        metadata={"page": doc_current_page - 1 if doc_current_page else "No page"}
                    )
                    current_page = doc_current_page
                    new_docs.append(current_doc)
                else:
                    current_doc.page_content += f"\n {doc.page_content}"
        else:
            new_docs = docs
        # splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(os.getenv("CHUNK_SIZE", 1500)), chunk_overlap=150
        )
        splits = text_splitter.split_documents(docs)
        splits = text_splitter.split_documents(new_docs)
        return splits

    def _get_file_loader(self):
        _, extension = os.path.splitext(self.file_location)
        loader = self.file_loader.get(extension)
        loader, kwargs = self.file_loader.get(extension, (None, {}))
        if loader is None:
            raise ValueError("File format does not have a loader!")
        return loader
        return loader, kwargs
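
A hedged usage sketch for the reworked loader table (the file path is hypothetical): for .pptx input, the element-level documents produced by UnstructuredPowerPointLoader are now stitched back together per slide before the recursive splitter runs, so each chunk keeps a usable page index in its metadata.

# Hypothetical usage of the reworked FileHandler (the path is a placeholder).
from graph_creator.services.file_handler import FileHandler

handler = FileHandler("uploads/example_slides.pptx")
chunks = handler.process_file_into_chunks()
for chunk in chunks[:3]:
    print(chunk.metadata.get("page"), chunk.page_content[:80])
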
19 changes: 0 additions & 19 deletions Project/frontend/src/components/App/index.css
@@ -58,22 +58,3 @@ img {
  justify-content: center;
  gap: 10px;
}

.main_wrapper {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 20px;
  margin: 20px;
  min-width: 100%;
  min-height: 100%;
}

.Appcontainer {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 20px;
  min-width: 100%;
  min-height: 100%;
}
5 changes: 1 addition & 4 deletions Project/frontend/src/components/App/index.tsx
@@ -5,14 +5,11 @@ import {
  Routes,
} from 'react-router-dom';
import {
  AppBar,
  createTheme,
  CssBaseline,
  Divider,
  Paper,
  Stack,
  ThemeProvider,
  Toolbar,
  Typography,
} from '@mui/material';

@@ -37,7 +34,7 @@ function App() {
      <Stack direction="column" flex={1}>
        <Paper
          variant="elevation"
          elevation={0.7}
          elevation={1}
          component={Stack}
          display={'flex'}
          flexDirection={'row'}