Skip to content
This repository was archived by the owner on Jul 17, 2025. It is now read-only.

Commit d725fb2

Browse files
committed
feat: Add CommaSeparatedStrList validation tests and update project versions
1 parent a55f762 commit d725fb2

22 files changed

+608
-107
lines changed

admin-api-lib/pyproject.toml

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
[build-system]
2+
requires = ["poetry-core"]
3+
build-backend = "poetry.core.masonry.api"
4+
5+
[tool.poetry]
6+
name = "admin-api-lib"
7+
version = "1.0.1"
8+
description = "The admin backend is responsible for the document management. This includes deletion, upload and returning the source document."
9+
authors = ["STACKIT Data and AI Consulting <[email protected]>"]
10+
packages = [{ include = "admin_api_lib", from = "src" }]
11+
112
[tool.flake8]
213
exclude= [".eggs", "./rag-core-library/*", "./src/admin_api_lib/models/*", "./src/admin_api_lib/rag_backend_client/*", "./src/admin_api_lib/extractor_api_client/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py"]
314
statistics = true
@@ -51,12 +62,6 @@ known_local_folder = ["admin_api_lib", "rag_core_lib"]
5162
[tool.pylint]
5263
max-line-length = 120
5364

54-
[tool.poetry]
55-
name = "admin_api_lib"
56-
version = "0.0.1"
57-
description = "The admin backend is responsible for the document management. This includes deletion, upload and returning the source document."
58-
authors = ["STACKIT Data and AI Consulting <[email protected]>"]
59-
6065
[tool.poetry.group.dev.dependencies]
6166
debugpy = "^1.8.1"
6267
pytest = "^8.2.1"
@@ -85,11 +90,6 @@ black = "^23.9.1"
8590
# flake8-logging-format = "^2024.24.12"
8691
# flake8-docstrings = "^1.7.0"
8792

88-
89-
[build-system]
90-
requires = ["poetry-core"]
91-
build-backend = "poetry.core.masonry.api"
92-
9393
[tool.poetry.dependencies]
9494
rag-core-lib = {path = "../rag-core-lib"}
9595
python = "^3.11"
@@ -103,3 +103,9 @@ tqdm = "^4.66.4"
103103
langfuse = "^2.39.1"
104104
redis = "^5.0.8"
105105
pyyaml = "^6.0.2"
106+
107+
[tool.pytest.ini_options]
108+
log_cli = 1
109+
log_cli_level = "DEBUG"
110+
pythonpath = "src"
111+
testpaths = "src/tests"

admin-api-lib/src/admin_api_lib/dependency_container.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ class DependencyContainer(DeclarativeContainer):
148148
)
149149

150150
summary_enhancer = List(
151-
Singleton(PageSummaryEnhancer, summarizer),
151+
Singleton(PageSummaryEnhancer, summarizer, chunker_settings),
152152
)
153153
untraced_information_enhancer = Singleton(
154154
GeneralEnhancer,

admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py

Lines changed: 59 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import logging
44
from asyncio import run
55
from threading import Thread
6+
import threading
67

78
from fastapi import HTTPException, status
9+
from langchain_core.documents import Document
810

911
from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader
1012
from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter
@@ -100,10 +102,16 @@ async def aload_from_confluence(self) -> None:
100102
HTTPException
101103
If the Confluence loader is not configured or if a load is already in progress.
102104
"""
103-
if not (self._settings.url.strip() and self._settings.space_key.strip() and self._settings.token.strip()):
104-
raise HTTPException(
105-
status.HTTP_501_NOT_IMPLEMENTED, "The confluence loader is not configured! Required fields are missing."
106-
)
105+
for index in range(len(self._settings.url)):
106+
if not (
107+
self._settings.url[index].strip()
108+
and self._settings.space_key[index].strip()
109+
and self._settings.token[index].strip()
110+
):
111+
raise HTTPException(
112+
status.HTTP_501_NOT_IMPLEMENTED,
113+
"The confluence loader is not configured! Required fields are missing.",
114+
)
107115

108116
if self._background_thread is not None and self._background_thread.is_alive():
109117
raise HTTPException(
@@ -113,51 +121,69 @@ async def aload_from_confluence(self) -> None:
113121
self._background_thread.start()
114122

115123
async def _aload_from_confluence(self) -> None:
116-
params = self._settings_mapper.map_settings_to_params(self._settings)
117-
try:
118-
self._key_value_store.upsert(self._settings.document_name, Status.PROCESSING)
119-
information_pieces = self._extractor_api.extract_from_confluence_post(params)
120-
documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces]
121-
chunked_documents = self._chunker.chunk(documents)
122-
rag_information_pieces = [
123-
self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents
124-
]
125-
except Exception as e:
126-
self._key_value_store.upsert(self._settings.document_name, Status.ERROR)
127-
logger.error("Error while loading from Confluence: %s", str(e))
128-
raise HTTPException(
129-
status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}"
130-
) from e
131-
132-
await self._delete_previous_information_pieces()
133-
self._key_value_store.upsert(self._settings.document_name, Status.UPLOADING)
134-
self._upload_information_pieces(rag_information_pieces)
135-
136-
async def _delete_previous_information_pieces(self):
124+
async def process_confluence(index):
125+
logger.info("Loading from Confluence %s", self._settings.url[index])
126+
self._sanitize_document_name(index=index)
127+
128+
params = self._settings_mapper.map_settings_to_params(self._settings, index)
129+
try:
130+
self._key_value_store.upsert(self._settings.document_name[index], Status.PROCESSING)
131+
information_pieces = self._extractor_api.extract_from_confluence_post(params)
132+
documents = [
133+
self._information_mapper.extractor_information_piece2document(x) for x in information_pieces
134+
]
135+
documents = await self._aenhance_langchain_documents(documents)
136+
chunked_documents = self._chunker.chunk(documents)
137+
rag_information_pieces = [
138+
self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents
139+
]
140+
except Exception as e:
141+
self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR)
142+
143+
logger.error("Error while loading from Confluence: %s", str(e))
144+
raise HTTPException(
145+
status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}"
146+
) from e
147+
148+
await self._delete_previous_information_pieces(index=index)
149+
self._key_value_store.upsert(self._settings.document_name[index], Status.UPLOADING)
150+
self._upload_information_pieces(rag_information_pieces, index=index)
151+
152+
threads = []
153+
for idx in range(len(self._settings.url)):
154+
t = threading.Thread(target=lambda idx=idx: run(process_confluence(idx)))
155+
threads.append(t)
156+
t.start()
157+
for t in threads:
158+
t.join()
159+
160+
async def _delete_previous_information_pieces(self, index=0):
137161
try:
138-
await self._document_deleter.adelete_document(self._settings.document_name)
162+
await self._document_deleter.adelete_document(self._settings.document_name[index])
139163
except HTTPException as e:
140164
logger.error(
141165
(
142166
"Error while trying to delete documents with id: %s before uploading %s."
143167
"NOTE: Still continuing with upload."
144168
),
145-
self._settings.document_name,
169+
self._settings.document_name[index],
146170
e,
147171
)
148172

149-
def _upload_information_pieces(self, rag_api_documents):
173+
def _upload_information_pieces(self, rag_api_documents, index=0):
150174
try:
151175
self._rag_api.upload_information_piece(rag_api_documents)
152-
self._key_value_store.upsert(self._settings.document_name, Status.READY)
176+
self._key_value_store.upsert(self._settings.document_name[index], Status.READY)
153177
logger.info("Confluence loaded successfully")
154178
except Exception as e:
155-
self._key_value_store.upsert(self._settings.document_name, Status.ERROR)
179+
self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR)
156180
logger.error("Error while uploading Confluence to the database: %s", str(e))
157181
raise HTTPException(500, f"Error loading from Confluence: {str(e)}") from e
158182

159-
def _sanitize_document_name(self) -> None:
160-
document_name = self._settings.document_name if self._settings.document_name else self._settings.url
183+
def _sanitize_document_name(self, index) -> None:
184+
document_name = (
185+
self._settings.document_name[index] if self._settings.document_name[index] else self._settings.url[index]
186+
)
161187
document_name = document_name.replace("http://", "").replace("https://", "")
162188

163-
self._settings.document_name = sanitize_document_name(document_name)
189+
self._settings.document_name[index] = sanitize_document_name(document_name)

admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,15 @@ class PageSummaryEnhancer(SummaryEnhancer):
2929

3030
async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
3131
# group infos by page, defaulting to page 1 if no page metadata
32+
if self._chunker_settings:
33+
filtered_information = [
34+
info for info in information if len(info.page_content) > self._chunker_settings.max_size
35+
]
36+
else:
37+
filtered_information = information
3238
grouped = [
33-
[info for info in information if info.metadata.get("page", self.DEFAULT_PAGE_NR) == page]
34-
for page in {info_piece.metadata.get("page", self.DEFAULT_PAGE_NR) for info_piece in information}
39+
[info for info in filtered_information if info.metadata.get("page", self.DEFAULT_PAGE_NR) == page]
40+
for page in {info_piece.metadata.get("page", self.DEFAULT_PAGE_NR) for info_piece in filtered_information}
3541
]
3642

3743
summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]

admin-api-lib/src/admin_api_lib/impl/information_enhancer/summary_enhancer.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from abc import abstractmethod
44
from typing import Optional
55

6+
from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings
67
from langchain_core.documents import Document
78
from langchain_core.runnables import RunnableConfig, ensure_config
89

@@ -26,7 +27,7 @@ class SummaryEnhancer(InformationEnhancer):
2627

2728
INFORMATION_METADATA_TYPE = "type"
2829

29-
def __init__(self, summarizer: Summarizer):
30+
def __init__(self, summarizer: Summarizer, chunker_settings: ChunkerSettings = None):
3031
"""
3132
Initialize the SummaryEnhancer with a given Summarizer instance.
3233
@@ -37,6 +38,7 @@ def __init__(self, summarizer: Summarizer):
3738
"""
3839
super().__init__()
3940
self._summarizer = summarizer
41+
self._chunker_settings = chunker_settings
4042

4143
@staticmethod
4244
def _is_relevant(information: Document) -> bool:

admin-api-lib/src/admin_api_lib/impl/key_db/file_status_key_value_store.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ def upsert(self, file_name: str, file_status: Status) -> None:
7676
None
7777
"""
7878
self.remove(file_name)
79-
8079
self._redis.sadd(self.STORAGE_KEY, FileStatusKeyValueStore._to_str(file_name, file_status))
8180

8281
def remove(self, file_name: str) -> None:

admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class ConfluenceSettingsMapper:
1010
"""Mapper class for converting ConfluenceSettings to ConfluenceParameters."""
1111

1212
@staticmethod
13-
def map_settings_to_params(settings: ConfluenceSettings) -> ConfluenceParameters:
13+
def map_settings_to_params(settings: ConfluenceSettings, index) -> ConfluenceParameters:
1414
"""
1515
Map ConfluenceSettings to ConfluenceParameters.
1616
@@ -25,11 +25,12 @@ def map_settings_to_params(settings: ConfluenceSettings) -> ConfluenceParameters
2525
The parameters object for API consumption.
2626
"""
2727
return ConfluenceParameters(
28-
url=settings.url,
29-
token=settings.token,
30-
space_key=settings.space_key,
31-
include_attachments=settings.include_attachments,
32-
keep_markdown_format=settings.keep_markdown_format,
33-
keep_newlines=settings.keep_newlines,
34-
document_name=settings.document_name,
28+
url=settings.url[index],
29+
token=settings.token[index],
30+
space_key=settings.space_key[index],
31+
include_attachments=settings.include_attachments[index],
32+
keep_markdown_format=settings.keep_markdown_format[index],
33+
keep_newlines=settings.keep_newlines[index],
34+
document_name=settings.document_name[index],
35+
confluence_kwargs=[{"key": "verify_ssl", "value": settings.verify_ssl[index]}],
3536
)

0 commit comments

Comments
 (0)