From 2921597c71d7ddda34b1a510705ab1bdddb19811 Mon Sep 17 00:00:00 2001
From: Philippe PRADOS
Date: Tue, 7 Jan 2025 17:00:04 +0100
Subject: [PATCH] community[patch]: Refactoring PDF loaders: 01 prepare (#29062)

- **Refactoring PDF loaders step 1**: "community: Refactoring PDF loaders to standardize approaches"
- **Description:** Declare CloudBlobLoader in __init__.py. file_path is Union[str, PurePath] everywhere.
- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).

@eyurtsev, this is the start of a PR series.
---
 .../document_loaders/__init__.py | 3 +
 .../document_loaders/parsers/pdf.py | 34 +++----
 .../document_loaders/pdf.py | 93 ++++++++++---------
 .../parsers/test_pdf_parsers.py | 2 +-
 .../document_loaders/test_pdf.py | 42 +++++----
 .../document_loaders/test_imports.py | 1 +
 6 files changed, 90 insertions(+), 85 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 8a56f918ab6ac..c91345daa6b03 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -87,6 +87,7 @@ from langchain_community.document_loaders.blob_loaders import ( Blob, BlobLoader, + CloudBlobLoader, FileSystemBlobLoader, YoutubeAudioLoader, ) @@ -574,6 +575,7 @@ "CSVLoader": "langchain_community.document_loaders.csv_loader", "CassandraLoader": "langchain_community.document_loaders.cassandra", "ChatGPTLoader": "langchain_community.document_loaders.chatgpt", + "CloudBlobLoader": "langchain_community.document_loaders.blob_loaders", "CoNLLULoader": "langchain_community.document_loaders.conllu", "CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501 "ConcurrentLoader": "langchain_community.document_loaders.concurrent", @@ -781,6 +783,7 @@ def __getattr__(name: str) -> Any: "CSVLoader", "CassandraLoader", "ChatGPTLoader", + "CloudBlobLoader", "CoNLLULoader", "CollegeConfidentialLoader", "ConcurrentLoader", diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 702d5998dd7c2..00b4510ee660d 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, Iterable, Iterator, Mapping, @@ -23,15 +22,13 @@ from langchain_community.document_loaders.blob_loaders import Blob if TYPE_CHECKING: - import fitz.fitz - import pdfminer.layout - import pdfplumber.page - import pypdf._page - import pypdfium2._helpers.page - from pypdf import PageObject + import fitz + import pdfminer + import pdfplumber + import pypdf + import pypdfium2 from textractor.data.text_linearization_config import TextLinearizationConfig - _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] _PDF_FILTER_WITHOUT_LOSS = [ "LZWDecode", @@ -90,7 +87,7 @@ def __init__( extract_images: bool = False, *, extraction_mode: str = "plain", - extraction_kwargs: Optional[Dict[str, Any]] = None, + extraction_kwargs: Optional[dict[str, Any]] = None, ): self.password = password self.extract_images = extract_images @@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> 
Iterator[Document]: # type: ignore[valid-ty "`pip install pypdf`" ) - def _extract_text_from_page(page: "PageObject") -> str: + def _extract_text_from_page(page: pypdf.PageObject) -> str: """ Extract text from image given the version of pypdf. """ @@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str: Document( page_content=_extract_text_from_page(page=page) + self._extract_images_from_page(page), - metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined] + metadata={"source": blob.source, "page": page_number}, + # type: ignore[attr-defined] ) for page_number, page in enumerate(pdf_reader.pages) ] - def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str: + def _extract_images_from_page(self, page: pypdf.PageObject) -> str: """Extract images from page and get the text with RapidOCR.""" if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined] return "" @@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty for page in doc ] - def _get_page_content( - self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob - ) -> str: + def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str: """ Get the text of the page using PyMuPDF and RapidOCR and issue a warning if it is empty. @@ -327,7 +323,7 @@ def _get_page_content( return content def _extract_metadata( - self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob + self, doc: fitz.Document, page: fitz.Page, blob: Blob ) -> dict: """Extract metadata from the document and page.""" return dict( @@ -344,9 +340,7 @@ def _extract_metadata( }, ) - def _extract_images_from_page( - self, doc: fitz.fitz.Document, page: fitz.fitz.Page - ) -> str: + def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str: """Extract images from page and get the text with RapidOCR.""" if not self.extract_images: return "" @@ -558,7 +552,7 @@ def __init__( textract_features: Optional[Sequence[int]] = None, client: Optional[Any] = None, *, - linearization_config: Optional["TextLinearizationConfig"] = None, + linearization_config: Optional[TextLinearizationConfig] = None, ) -> None: """Initializes the parser. diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 8e7d0152d3dc0..c8ee848a733a9 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -6,17 +6,17 @@ import time from abc import ABC from io import StringIO -from pathlib import Path +from pathlib import Path, PurePath from typing import ( TYPE_CHECKING, Any, - Dict, + BinaryIO, Iterator, - List, Mapping, Optional, Sequence, Union, + cast, ) from urllib.parse import urlparse @@ -68,7 +68,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf """ - def _get_elements(self) -> List: + def _get_elements(self) -> list: from unstructured.partition.pdf import partition_pdf return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] @@ -81,7 +81,9 @@ class BasePDFLoader(BaseLoader, ABC): clean up the temporary file after completion. 
""" - def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None): + def __init__( + self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None + ): """Initialize with a file path. Args: @@ -154,7 +156,7 @@ def source(self) -> str: class OnlinePDFLoader(BasePDFLoader): """Load online `PDF`.""" - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load documents.""" loader = UnstructuredPDFLoader(str(self.file_path)) return loader.load() @@ -223,13 +225,13 @@ class PyPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], password: Optional[Union[str, bytes]] = None, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, *, extraction_mode: str = "plain", - extraction_kwargs: Optional[Dict] = None, + extraction_kwargs: Optional[dict] = None, ) -> None: """Initialize with a file path.""" try: @@ -262,9 +264,9 @@ class PyPDFium2Loader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], *, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, ): """Initialize with a file path.""" @@ -290,7 +292,7 @@ class PyPDFDirectoryLoader(BaseLoader): def __init__( self, - path: Union[str, Path], + path: Union[str, PurePath], glob: str = "**/[!.]*.pdf", silent_errors: bool = False, load_hidden: bool = False, @@ -308,7 +310,7 @@ def __init__( def _is_visible(path: Path) -> bool: return not any(part.startswith(".") for part in path.parts) - def load(self) -> List[Document]: + def load(self) -> list[Document]: p = Path(self.path) docs = [] items = p.rglob(self.glob) if self.recursive else p.glob(self.glob) @@ -334,9 +336,9 @@ class PDFMinerLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], *, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, concatenate_pages: bool = True, ) -> None: @@ -374,7 +376,9 @@ def lazy_load( class PDFMinerPDFasHTMLLoader(BasePDFLoader): """Load `PDF` files as HTML content using `PDFMiner`.""" - def __init__(self, file_path: str, *, headers: Optional[Dict] = None): + def __init__( + self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None + ): """Initialize with a file path.""" try: from pdfminer.high_level import extract_text_to_fp # noqa:F401 @@ -395,14 +399,14 @@ def lazy_load(self) -> Iterator[Document]: output_string = StringIO() with open_filename(self.file_path, "rb") as fp: extract_text_to_fp( - fp, + cast(BinaryIO, fp), output_string, codec="", laparams=LAParams(), output_type="html", ) metadata = { - "source": self.file_path if self.web_path is None else self.web_path + "source": str(self.file_path) if self.web_path is None else self.web_path } yield Document(page_content=output_string.getvalue(), metadata=metadata) @@ -412,9 +416,9 @@ class PyMuPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], *, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, **kwargs: Any, ) -> None: @@ -447,7 +451,7 @@ def _lazy_load(self, **kwargs: Any) -> Iterator[Document]: blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from parser.lazy_parse(blob) - def load(self, **kwargs: Any) -> List[Document]: + def load(self, **kwargs: Any) -> list[Document]: return list(self._lazy_load(**kwargs)) def lazy_load(self) -> Iterator[Document]: @@ -461,11 +465,11 @@ 
class MathpixPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], processed_file_format: str = "md", max_wait_time_seconds: int = 500, should_clean_pdf: bool = False, - extra_request_data: Optional[Dict[str, Any]] = None, + extra_request_data: Optional[dict[str, Any]] = None, **kwargs: Any, ) -> None: """Initialize with a file path. @@ -499,7 +503,7 @@ def __init__( self.should_clean_pdf = should_clean_pdf @property - def _mathpix_headers(self) -> Dict[str, str]: + def _mathpix_headers(self) -> dict[str, str]: return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key} @property @@ -515,7 +519,7 @@ def data(self) -> dict: return {"options_json": json.dumps(options)} def send_pdf(self) -> str: - with open(self.file_path, "rb") as f: + with open(str(self.file_path), "rb") as f: files = {"file": f} response = requests.post( self.url, headers=self._mathpix_headers, files=files, data=self.data @@ -562,7 +566,7 @@ def wait_for_processing(self, pdf_id: str) -> None: # This indicates an error with the PDF processing raise ValueError("Unable to retrieve PDF from Mathpix") else: - print(f"Status: {status}, waiting for processing to complete") # noqa: T201 + logger.info("Status: %s, waiting for processing to complete", status) time.sleep(5) raise TimeoutError @@ -572,8 +576,7 @@ def get_processed_pdf(self, pdf_id: str) -> str: response = requests.get(url, headers=self._mathpix_headers) return response.content.decode("utf-8") - @staticmethod - def clean_pdf(contents: str) -> str: + def clean_pdf(self, contents: str) -> str: """Clean the PDF file. Args: @@ -596,7 +599,7 @@ def clean_pdf(contents: str) -> str: ) return contents - def load(self) -> List[Document]: + def load(self) -> list[Document]: pdf_id = self.send_pdf() contents = self.get_processed_pdf(pdf_id) if self.should_clean_pdf: @@ -610,10 +613,10 @@ class PDFPlumberLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, ) -> None: """Initialize with a file path.""" @@ -630,7 +633,7 @@ def __init__( self.dedupe = dedupe self.extract_images = extract_images - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load file.""" parser = PDFPlumberParser( @@ -669,13 +672,13 @@ class AmazonTextractPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], textract_features: Optional[Sequence[str]] = None, client: Optional[Any] = None, credentials_profile_name: Optional[str] = None, region_name: Optional[str] = None, endpoint_url: Optional[str] = None, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, *, linearization_config: Optional["TextLinearizationConfig"] = None, ) -> None: @@ -743,7 +746,7 @@ def __init__( linearization_config=linearization_config, ) - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load given path as pages.""" return list(self.lazy_load()) @@ -758,7 +761,7 @@ def lazy_load( if self.web_path and self._is_s3_url(self.web_path): blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc] else: - blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] + blob = Blob.from_path(self.file_path) if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1: raise ValueError( f"the file {blob.path} is a multi-page document, \ @@ -792,7 +795,9 @@ 
def _get_number_of_pages(blob: Blob) -> int: # type: ignore[valid-type] elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined] return 1 else: - raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined] + raise ValueError( # type: ignore[attr-defined] + f"unsupported mime type: {blob.mimetype}" + ) class DedocPDFLoader(DedocBaseLoader): @@ -887,7 +892,7 @@ def _make_config(self) -> dict: from dedoc.utils.langchain import make_manager_pdf_config return make_manager_pdf_config( - file_path=self.file_path, + file_path=str(self.file_path), parsing_params=self.parsing_parameters, split=self.split, ) @@ -898,10 +903,10 @@ class DocumentIntelligenceLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], client: Any, model: str = "prebuilt-document", - headers: Optional[Dict] = None, + headers: Optional[dict] = None, ) -> None: """ Initialize the object for file processing with Azure Document Intelligence @@ -930,10 +935,10 @@ def __init__( ... ) """ - self.parser = DocumentIntelligenceParser(client=client, model=model) super().__init__(file_path, headers=headers) + self.parser = DocumentIntelligenceParser(client=client, model=model) - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load given path as pages.""" return list(self.lazy_load()) @@ -964,7 +969,7 @@ class ZeroxPDFLoader(BasePDFLoader): def __init__( self, - file_path: Union[str, Path], + file_path: Union[str, PurePath], model: str = "gpt-4o-mini", **zerox_kwargs: Any, ) -> None: @@ -1005,7 +1010,7 @@ def lazy_load(self) -> Iterator[Document]: # Directly call asyncio.run to execute zerox synchronously zerox_output = asyncio.run( - zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs) + zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs) ) # Convert zerox output to Document instances and yield them diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index 928c26898c9e7..47163c859520b 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -61,7 +61,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF) if splits_by_page: - assert metadata["page"] == 0 + assert int(metadata["page"]) == 0 def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None: diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 50c9fde29d918..66ce4ce3fa587 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import Sequence, Union @@ -17,7 +18,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None: """Test unstructured loader with various modes.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = UnstructuredPDFLoader(str(file_path), mode="elements") + loader = UnstructuredPDFLoader(file_path, mode="elements") docs = loader.load() assert len(docs) == 2 @@ -26,7 +27,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None: def 
test_unstructured_pdf_loader_paged_mode() -> None: """Test unstructured loader with various modes.""" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = UnstructuredPDFLoader(str(file_path), mode="paged") + loader = UnstructuredPDFLoader(file_path, mode="paged") docs = loader.load() assert len(docs) == 16 @@ -35,7 +36,7 @@ def test_unstructured_pdf_loader_paged_mode() -> None: def test_unstructured_pdf_loader_default_mode() -> None: """Test unstructured loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = UnstructuredPDFLoader(str(file_path)) + loader = UnstructuredPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 @@ -44,26 +45,26 @@ def test_unstructured_pdf_loader_default_mode() -> None: def test_pdfminer_loader() -> None: """Test PDFMiner loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerLoader(str(file_path)) + loader = PDFMinerLoader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerLoader(str(file_path)) + loader = PDFMinerLoader(file_path) docs = loader.load() assert len(docs) == 1 # Verify that concatenating pages parameter works file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerLoader(str(file_path), concatenate_pages=True) + loader = PDFMinerLoader(file_path, concatenate_pages=True) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerLoader(str(file_path), concatenate_pages=False) + loader = PDFMinerLoader(file_path, concatenate_pages=False) docs = loader.load() assert len(docs) == 16 @@ -72,13 +73,13 @@ def test_pdfminer_loader() -> None: def test_pdfminer_pdf_as_html_loader() -> None: """Test PDFMinerPDFasHTMLLoader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerPDFasHTMLLoader(str(file_path)) + loader = PDFMinerPDFasHTMLLoader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerPDFasHTMLLoader(str(file_path)) + loader = PDFMinerPDFasHTMLLoader(file_path) docs = loader.load() assert len(docs) == 1 @@ -87,13 +88,13 @@ def test_pdfminer_pdf_as_html_loader() -> None: def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyPDFium2Loader(str(file_path)) + loader = PyPDFium2Loader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFium2Loader(str(file_path)) + loader = PyPDFium2Loader(file_path) docs = loader.load() assert len(docs) == 16 @@ -102,13 +103,13 @@ def test_pypdfium2_loader() -> None: def test_pymupdf_loader() -> None: """Test PyMuPDF loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyMuPDFLoader(str(file_path)) + loader = PyMuPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyMuPDFLoader(str(file_path)) + loader = PyMuPDFLoader(file_path) docs = loader.load() assert len(docs) == 16 @@ -123,20 +124,21 @@ def test_pymupdf_loader() -> None: assert len(docs) == 1 +@pytest.mark.skipif( + not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found" +) def 
test_mathpix_loader() -> None: file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = MathpixPDFLoader(str(file_path)) + loader = MathpixPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 - print(docs[0].page_content) # noqa: T201 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = MathpixPDFLoader(str(file_path)) + loader = MathpixPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 - print(docs[0].page_content) # noqa: T201 @pytest.mark.parametrize( @@ -187,8 +189,8 @@ def test_mathpix_loader() -> None: 1, False, ), - (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False), - (str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False), + (Path(__file__).parent.parent / "examples/hello.pdf", ["FORMS"], 1, False), + (Path(__file__).parent.parent / "examples/hello.pdf", [], 1, False), ( "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf", ["FORMS", "TABLES", "LAYOUT"], @@ -222,7 +224,7 @@ def test_amazontextract_loader( @pytest.mark.skip(reason="Requires AWS credentials to run") def test_amazontextract_loader_failures() -> None: # 2-page PDF local file system - two_page_pdf = str( + two_page_pdf = ( Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf" ) loader = AmazonTextractPDFLoader(two_page_pdf) diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index ddeaf734b0fe8..33c988d1b9294 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -43,6 +43,7 @@ "CassandraLoader", "CSVLoader", "ChatGPTLoader", + "CloudBlobLoader", "CoNLLULoader", "CollegeConfidentialLoader", "ConcurrentLoader",
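
Below is a minimal usage sketch of what this patch enables, assuming `pypdf` and `cloudpathlib` are installed; the local file path and the S3 URL are hypothetical placeholders, not part of this PR. `CloudBlobLoader` can now be imported directly from `langchain_community.document_loaders`, and the PDF loaders accept a `PurePath` (for example a `pathlib.Path`) as well as a `str` for `file_path`.

```python
from pathlib import Path

from langchain_community.document_loaders import CloudBlobLoader, PyPDFLoader

# file_path may now be a pathlib Path (any PurePath), not only a str.
loader = PyPDFLoader(Path("examples") / "hello.pdf")  # hypothetical local file
docs = loader.load()
print(len(docs), docs[0].metadata["source"])

# CloudBlobLoader is now re-exported at the document_loaders package level.
blob_loader = CloudBlobLoader("s3://my-bucket/pdfs/", glob="**/*.pdf")  # hypothetical bucket
for blob in blob_loader.yield_blobs():
    print(blob.path)
```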