
Commit 95cba5b

Merge branch 'main' into reranker
2 parents c1267d3 + 65c64ed commit 95cba5b

File tree

9 files changed: +400 -105 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,3 +1,4 @@
+.git
 .idea
 .venv
 .env
```

README.md

Lines changed: 4 additions & 1 deletion

````diff
@@ -64,7 +64,7 @@ The following environment variables are required to run the application:
 - `DEBUG_RAG_API`: (Optional) Set to "True" to show more verbose logging output in the server console, and to enable postgresql database routes
 - `DEBUG_PGVECTOR_QUERIES`: (Optional) Set to "True" to enable detailed PostgreSQL query logging for pgvector operations. Useful for debugging performance issues with vector database queries.
 - `CONSOLE_JSON`: (Optional) Set to "True" to log as json for Cloud Logging aggregations
-- `EMBEDDINGS_PROVIDER`: (Optional) either "openai", "bedrock", "azure", "huggingface", "huggingfacetei", "vertexai", or "ollama", where "huggingface" uses sentence_transformers; defaults to "openai"
+- `EMBEDDINGS_PROVIDER`: (Optional) either "openai", "bedrock", "azure", "huggingface", "huggingfacetei", "google_genai", "vertexai", or "ollama", where "huggingface" uses sentence_transformers; defaults to "openai"
 - `EMBEDDINGS_MODEL`: (Optional) Set a valid embeddings model to use from the configured provider.
   - **Defaults**
     - openai: "text-embedding-3-small"
@@ -74,6 +74,7 @@ The following environment variables are required to run the application:
     - vertexai: "text-embedding-004"
     - ollama: "nomic-embed-text"
     - bedrock: "amazon.titan-embed-text-v1"
+    - google_genai: "gemini-embedding-001"
 - `RAG_AZURE_OPENAI_API_VERSION`: (Optional) Default is `2023-05-15`. The version of the Azure OpenAI API.
 - `RAG_AZURE_OPENAI_API_KEY`: (Optional) The API key for Azure OpenAI service.
   - Note: `AZURE_OPENAI_API_KEY` will work but `RAG_AZURE_OPENAI_API_KEY` will override it in order to not conflict with LibreChat setting.
@@ -87,6 +88,7 @@ The following environment variables are required to run the application:
 - `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1`
 - `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings
 - `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings
+- `GOOGLE_API_KEY`, `GOOGLE_KEY`, `RAG_GOOGLE_API_KEY`: (Optional) Google API key for Google GenAI embeddings. Priority order: RAG_GOOGLE_API_KEY > GOOGLE_KEY > GOOGLE_API_KEY
 - `AWS_SESSION_TOKEN`: (Optional) may be needed for bedrock embeddings
 - `GOOGLE_APPLICATION_CREDENTIALS`: (Optional) needed for Google VertexAI embeddings. This should be a path to a service account credential file in JSON format, as accepted by [langchain](https://python.langchain.com/api_reference/google_vertexai/index.html)
 - `RAG_CHECK_EMBEDDING_CTX_LENGTH` (Optional) Default is true, disabling this will send raw input to the embedder, use this for custom embedding models.
@@ -175,3 +177,4 @@ Run the following commands to install pre-commit formatter, which uses [black](h
 pip install pre-commit
 pre-commit install
 ```
+
````
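To see how the new provider slots into the documented defaults, here is a minimal sketch (illustrative only; the real resolution happens in `app/config.py`, and the key value is a placeholder):

```python
import os

# Illustrative configuration for the new provider; the key is a placeholder.
os.environ["EMBEDDINGS_PROVIDER"] = "google_genai"
os.environ["RAG_GOOGLE_API_KEY"] = "your-google-api-key"

# Per-provider default models, as listed in this README section.
DEFAULT_MODELS = {
    "openai": "text-embedding-3-small",
    "vertexai": "text-embedding-004",
    "ollama": "nomic-embed-text",
    "bedrock": "amazon.titan-embed-text-v1",
    "google_genai": "gemini-embedding-001",
}

provider = os.environ.get("EMBEDDINGS_PROVIDER", "openai")
model = os.environ.get("EMBEDDINGS_MODEL", DEFAULT_MODELS[provider])
print(provider, model)  # -> google_genai gemini-embedding-001
```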

app/config.py

Lines changed: 13 additions & 0 deletions

```diff
@@ -26,6 +26,7 @@ class EmbeddingsProvider(Enum):
     HUGGINGFACETEI = "huggingfacetei"
     OLLAMA = "ollama"
     BEDROCK = "bedrock"
+    GOOGLE_GENAI = "google_genai"
     GOOGLE_VERTEXAI = "vertexai"
 
 
@@ -186,6 +187,9 @@ async def dispatch(self, request, call_next):
 OLLAMA_BASE_URL = get_env_variable("OLLAMA_BASE_URL", "http://ollama:11434")
 AWS_ACCESS_KEY_ID = get_env_variable("AWS_ACCESS_KEY_ID", "")
 AWS_SECRET_ACCESS_KEY = get_env_variable("AWS_SECRET_ACCESS_KEY", "")
+GOOGLE_API_KEY = get_env_variable("GOOGLE_API_KEY", "")
+GOOGLE_KEY = get_env_variable("GOOGLE_KEY", GOOGLE_API_KEY)
+RAG_GOOGLE_API_KEY = get_env_variable("RAG_GOOGLE_API_KEY", GOOGLE_KEY)
 AWS_SESSION_TOKEN = get_env_variable("AWS_SESSION_TOKEN", "")
 GOOGLE_APPLICATION_CREDENTIALS = get_env_variable("GOOGLE_APPLICATION_CREDENTIALS", "")
 env_value = get_env_variable("RAG_CHECK_EMBEDDING_CTX_LENGTH", "True").lower()
@@ -231,6 +235,13 @@ def init_embeddings(provider, model):
         from langchain_ollama import OllamaEmbeddings
 
         return OllamaEmbeddings(model=model, base_url=OLLAMA_BASE_URL)
+    elif provider == EmbeddingsProvider.GOOGLE_GENAI:
+        from langchain_google_genai import GoogleGenerativeAIEmbeddings
+
+        return GoogleGenerativeAIEmbeddings(
+            model=model,
+            google_api_key=RAG_GOOGLE_API_KEY,
+        )
     elif provider == EmbeddingsProvider.GOOGLE_VERTEXAI:
         from langchain_google_vertexai import VertexAIEmbeddings
 
@@ -281,6 +292,8 @@ def init_embeddings(provider, model):
     EMBEDDINGS_MODEL = get_env_variable("EMBEDDINGS_MODEL", "text-embedding-004")
 elif EMBEDDINGS_PROVIDER == EmbeddingsProvider.OLLAMA:
     EMBEDDINGS_MODEL = get_env_variable("EMBEDDINGS_MODEL", "nomic-embed-text")
+elif EMBEDDINGS_PROVIDER == EmbeddingsProvider.GOOGLE_GENAI:
+    EMBEDDINGS_MODEL = get_env_variable("EMBEDDINGS_MODEL", "gemini-embedding-001")
 elif EMBEDDINGS_PROVIDER == EmbeddingsProvider.BEDROCK:
     EMBEDDINGS_MODEL = get_env_variable(
         "EMBEDDINGS_MODEL", "amazon.titan-embed-text-v1"
```

app/routes/document_routes.py

Lines changed: 172 additions & 57 deletions

```diff
@@ -44,6 +44,94 @@
 router = APIRouter()
 
 
+def get_user_id(request: Request, entity_id: str = None) -> str:
+    """Extract user ID from request or entity_id."""
+    if not hasattr(request.state, "user"):
+        return entity_id if entity_id else "public"
+    else:
+        return entity_id if entity_id else request.state.user.get("id")
+
+
+async def save_upload_file_async(file: UploadFile, temp_file_path: str) -> None:
+    """Save uploaded file asynchronously."""
+    try:
+        async with aiofiles.open(temp_file_path, "wb") as temp_file:
+            chunk_size = 64 * 1024  # 64 KB
+            while content := await file.read(chunk_size):
+                await temp_file.write(content)
+    except Exception as e:
+        logger.error(
+            "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s",
+            temp_file_path,
+            str(e),
+            traceback.format_exc(),
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to save the uploaded file. Error: {str(e)}",
+        )
+
+
+def save_upload_file_sync(file: UploadFile, temp_file_path: str) -> None:
+    """Save uploaded file synchronously."""
+    try:
+        with open(temp_file_path, "wb") as temp_file:
+            copyfileobj(file.file, temp_file)
+    except Exception as e:
+        logger.error(
+            "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s",
+            temp_file_path,
+            str(e),
+            traceback.format_exc(),
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to save the uploaded file. Error: {str(e)}",
+        )
+
+
+async def load_file_content(
+    filename: str, content_type: str, file_path: str, executor
+) -> tuple:
+    """Load file content using appropriate loader."""
+    loader, known_type, file_ext = get_loader(filename, content_type, file_path)
+    data = await run_in_executor(executor, loader.load)
+
+    # Clean up temporary UTF-8 file if it was created for encoding conversion
+    cleanup_temp_encoding_file(loader)
+
+    return data, known_type, file_ext
+
+
+def extract_text_from_documents(documents: List[Document], file_ext: str) -> str:
+    """Extract text content from loaded documents."""
+    text_content = ""
+    if documents:
+        for doc in documents:
+            if hasattr(doc, "page_content"):
+                # Clean text if it's a PDF
+                if file_ext == "pdf":
+                    text_content += clean_text(doc.page_content) + "\n"
+                else:
+                    text_content += doc.page_content + "\n"
+
+    # Remove trailing newline
+    return text_content.rstrip("\n")
+
+
+async def cleanup_temp_file_async(file_path: str) -> None:
+    """Clean up temporary file asynchronously."""
+    try:
+        await aiofiles.os.remove(file_path)
+    except Exception as e:
+        logger.error(
+            "Failed to remove temporary file | Path: %s | Error: %s | Traceback: %s",
+            file_path,
+            str(e),
+            traceback.format_exc(),
+        )
+
+
 @router.get("/ids")
 async def get_all_ids(request: Request):
     try:
```
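To see what the new `extract_text_from_documents` helper produces, here is a small standalone check; `clean_text` is stubbed as a pass-through, since the real cleaner lives elsewhere in this repo:

```python
from langchain_core.documents import Document

def clean_text(text: str) -> str:
    # Stand-in for the repo's PDF text cleaner; pass-through in this sketch.
    return text

def extract_text_from_documents(documents, file_ext):
    # Same joining logic as the new helper: one page per line,
    # with PDF pages routed through clean_text().
    text_content = ""
    for doc in documents:
        if hasattr(doc, "page_content"):
            page = doc.page_content
            text_content += (clean_text(page) if file_ext == "pdf" else page) + "\n"
    return text_content.rstrip("\n")

docs = [Document(page_content="page one"), Document(page_content="page two")]
assert extract_text_from_documents(docs, "txt") == "page one\npage two"
```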
```diff
@@ -251,7 +339,12 @@ async def query_embeddings_by_file_id(
 
 
 def generate_digest(page_content: str):
-    hash_obj = hashlib.md5(page_content.encode())
+    try:
+        hash_obj = hashlib.md5(page_content.encode("utf-8"))
+    except UnicodeEncodeError:
+        hash_obj = hashlib.md5(
+            page_content.encode("utf-8", "ignore").decode("utf-8").encode("utf-8")
+        )
     return hash_obj.hexdigest()
 
 
```
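The rewritten `generate_digest` only takes the fallback path for strings that cannot be UTF-8 encoded, which in CPython essentially means lone surrogates; the fallback strips them before hashing. A quick standalone check of both paths:

```python
import hashlib

def generate_digest(page_content: str):
    try:
        hash_obj = hashlib.md5(page_content.encode("utf-8"))
    except UnicodeEncodeError:
        # Drop unencodable code points (e.g. lone surrogates), then hash.
        hash_obj = hashlib.md5(
            page_content.encode("utf-8", "ignore").decode("utf-8").encode("utf-8")
        )
    return hash_obj.hexdigest()

print(generate_digest("hello"))           # normal path
print(generate_digest("bad\ud800data"))   # lone surrogate triggers the fallback
```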
```diff
@@ -383,40 +476,21 @@ async def embed_file(
     response_status = True
     response_message = "File processed successfully."
     known_type = None
-    if not hasattr(request.state, "user"):
-        user_id = entity_id if entity_id else "public"
-    else:
-        user_id = entity_id if entity_id else request.state.user.get("id")
 
+    user_id = get_user_id(request, entity_id)
     temp_base_path = os.path.join(RAG_UPLOAD_DIR, user_id)
     os.makedirs(temp_base_path, exist_ok=True)
     temp_file_path = os.path.join(RAG_UPLOAD_DIR, user_id, file.filename)
 
-    try:
-        async with aiofiles.open(temp_file_path, "wb") as temp_file:
-            chunk_size = 64 * 1024  # 64 KB
-            while content := await file.read(chunk_size):
-                await temp_file.write(content)
-    except Exception as e:
-        logger.error(
-            "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s",
-            temp_file_path,
-            str(e),
-            traceback.format_exc(),
-        )
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Failed to save the uploaded file. Error: {str(e)}",
-        )
+    await save_upload_file_async(file, temp_file_path)
 
     try:
-        loader, known_type, file_ext = get_loader(
-            file.filename, file.content_type, temp_file_path
+        data, known_type, file_ext = await load_file_content(
+            file.filename,
+            file.content_type,
+            temp_file_path,
+            request.app.state.thread_pool,
         )
-        data = await run_in_executor(request.app.state.thread_pool, loader.load)
-
-        # Clean up temporary UTF-8 file if it was created for encoding conversion
-        cleanup_temp_encoding_file(loader)
 
         result = await store_data_in_vector_db(
             data=data,
@@ -465,15 +539,7 @@ async def embed_file(
             detail=f"Error during file processing: {str(e)}",
         )
     finally:
-        try:
-            await aiofiles.os.remove(temp_file_path)
-        except Exception as e:
-            logger.error(
-                "Failed to remove temporary file | Path: %s | Error: %s | Traceback: %s",
-                temp_file_path,
-                str(e),
-                traceback.format_exc(),
-            )
+        await cleanup_temp_file_async(temp_file_path)
 
     return {
         "status": response_status,
```
```diff
@@ -539,32 +605,19 @@ async def embed_file_upload(
     uploaded_file: UploadFile = File(...),
     entity_id: str = Form(None),
 ):
+    user_id = get_user_id(request, entity_id)
     temp_file_path = os.path.join(RAG_UPLOAD_DIR, uploaded_file.filename)
 
-    if not hasattr(request.state, "user"):
-        user_id = entity_id if entity_id else "public"
-    else:
-        user_id = entity_id if entity_id else request.state.user.get("id")
-
-    try:
-        with open(temp_file_path, "wb") as temp_file:
-            copyfileobj(uploaded_file.file, temp_file)
-    except Exception as e:
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Failed to save the uploaded file. Error: {str(e)}",
-        )
+    save_upload_file_sync(uploaded_file, temp_file_path)
 
     try:
-        loader, known_type, file_ext = get_loader(
-            uploaded_file.filename, uploaded_file.content_type, temp_file_path
+        data, known_type, file_ext = await load_file_content(
+            uploaded_file.filename,
+            uploaded_file.content_type,
+            temp_file_path,
+            request.app.state.thread_pool,
         )
 
-        data = await run_in_executor(request.app.state.thread_pool, loader.load)
-
-        # Clean up temporary UTF-8 file if it was created for encoding conversion
-        cleanup_temp_encoding_file(loader)
-
         result = await store_data_in_vector_db(
             data,
             file_id,
```
```diff
@@ -651,7 +704,6 @@ async def query_embeddings_by_file_ids(request: Request, body: QueryMultipleBody
         )
         raise HTTPException(status_code=500, detail=str(e))
 
-
 @router.post("/rerank")
 async def rerank_documents_by_query(request: Request, body: QueryMultipleDocs):
     try:
```
```diff
@@ -690,3 +742,66 @@ async def rerank_documents_by_query(request: Request, body: QueryMultipleDocs):
             traceback.format_exc(),
         )
         raise HTTPException(status_code=500, detail=str(e))
+
+@router.post("/text")
+async def extract_text_from_file(
+    request: Request,
+    file_id: str = Form(...),
+    file: UploadFile = File(...),
+    entity_id: str = Form(None),
+):
+    """
+    Extract text content from an uploaded file without creating embeddings.
+    Returns the raw text content for text parsing purposes.
+    """
+    user_id = get_user_id(request, entity_id)
+    temp_base_path = os.path.join(RAG_UPLOAD_DIR, user_id)
+    os.makedirs(temp_base_path, exist_ok=True)
+    temp_file_path = os.path.join(RAG_UPLOAD_DIR, user_id, file.filename)
+
+    await save_upload_file_async(file, temp_file_path)
+
+    try:
+        data, known_type, file_ext = await load_file_content(
+            file.filename,
+            file.content_type,
+            temp_file_path,
+            request.app.state.thread_pool,
+        )
+
+        # Extract text content from loaded documents
+        text_content = extract_text_from_documents(data, file_ext)
+
+        return {
+            "text": text_content,
+            "file_id": file_id,
+            "filename": file.filename,
+            "known_type": known_type,
+        }
+
+    except HTTPException as http_exc:
+        logger.error(
+            "HTTP Exception in extract_text_from_file | Status: %d | Detail: %s",
+            http_exc.status_code,
+            http_exc.detail,
+        )
+        raise http_exc
+    except Exception as e:
+        logger.error(
+            "Error during text extraction | File: %s | Error: %s | Traceback: %s",
+            file.filename,
+            str(e),
+            traceback.format_exc(),
+        )
+        if "No pandoc was found" in str(e):
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
+            )
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Error during text extraction: {str(e)}",
+            )
+    finally:
+        await cleanup_temp_file_async(temp_file_path)
```
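A hedged client-side sketch of exercising the new `/text` route with `requests`; the base URL, port, and absence of auth headers are assumptions that depend on how the API is deployed:

```python
import requests

# Assumes the RAG API is reachable locally without auth middleware;
# adjust the URL (and headers) for a real deployment.
with open("report.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/text",
        data={"file_id": "demo-file-id", "entity_id": "public"},
        files={"file": ("report.pdf", f, "application/pdf")},
    )

resp.raise_for_status()
payload = resp.json()
print(payload["known_type"], len(payload["text"]))
```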
