From 2921597c71d7ddda34b1a510705ab1bdddb19811 Mon Sep 17 00:00:00 2001
From: Philippe PRADOS
Date: Tue, 7 Jan 2025 17:00:04 +0100
Subject: [PATCH] community[patch]: Refactoring PDF loaders: 01 prepare (#29062)

- **Refactoring PDF loaders step 1**: "community: Refactoring PDF loaders to standardize approaches"
- **Description:** Declare CloudBlobLoader in __init__.py. file_path is Union[str, PurePath] everywhere.
- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).

@eyurtsev, this is the start of a PR series.
---
 .../document_loaders/__init__.py | 3 +
 .../document_loaders/parsers/pdf.py | 34 +++----
 .../document_loaders/pdf.py | 93 ++++++++++---------
 .../parsers/test_pdf_parsers.py | 2 +-
 .../document_loaders/test_pdf.py | 42 +++++----
 .../document_loaders/test_imports.py | 1 +
 6 files changed, 90 insertions(+), 85 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 8a56f918ab6ac..c91345daa6b03 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -87,6 +87,7 @@ from langchain_community.document_loaders.blob_loaders import ( Blob, BlobLoader, + CloudBlobLoader, FileSystemBlobLoader, YoutubeAudioLoader, ) @@ -574,6 +575,7 @@ "CSVLoader": "langchain_community.document_loaders.csv_loader", "CassandraLoader": "langchain_community.document_loaders.cassandra", "ChatGPTLoader": "langchain_community.document_loaders.chatgpt", + "CloudBlobLoader": "langchain_community.document_loaders.blob_loaders", "CoNLLULoader": "langchain_community.document_loaders.conllu", "CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501 "ConcurrentLoader": "langchain_community.document_loaders.concurrent", @@ -781,6 +783,7 @@ def __getattr__(name: str) -> Any: "CSVLoader", "CassandraLoader", "ChatGPTLoader", + "CloudBlobLoader", "CoNLLULoader", "CollegeConfidentialLoader", "ConcurrentLoader", diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 702d5998dd7c2..00b4510ee660d 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, Iterable, Iterator, Mapping, @@ -23,15 +22,13 @@ from langchain_community.document_loaders.blob_loaders import Blob if TYPE_CHECKING: - import fitz.fitz - import pdfminer.layout - import pdfplumber.page - import pypdf._page - import pypdfium2._helpers.page - from pypdf import PageObject + import fitz + import pdfminer + import pdfplumber + import pypdf + import pypdfium2 from textractor.data.text_linearization_config import TextLinearizationConfig - _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] _PDF_FILTER_WITHOUT_LOSS = [ "LZWDecode", @@ -90,7 +87,7 @@ def __init__( extract_images: bool = False, *, extraction_mode: str = "plain", - extraction_kwargs: Optional[Dict[str, Any]] = None, + extraction_kwargs: Optional[dict[str, Any]] = None, ): self.password = password self.extract_images = extract_images @@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> 
Iterator[Document]: # type: ignore[valid-ty "`pip install pypdf`" ) - def _extract_text_from_page(page: "PageObject") -> str: + def _extract_text_from_page(page: pypdf.PageObject) -> str: """ Extract text from image given the version of pypdf. """ @@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str: Document( page_content=_extract_text_from_page(page=page) + self._extract_images_from_page(page), - metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined] + metadata={"source": blob.source, "page": page_number}, + # type: ignore[attr-defined] ) for page_number, page in enumerate(pdf_reader.pages) ] - def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str: + def _extract_images_from_page(self, page: pypdf.PageObject) -> str: """Extract images from page and get the text with RapidOCR.""" if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined] return "" @@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty for page in doc ] - def _get_page_content( - self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob - ) -> str: + def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str: """ Get the text of the page using PyMuPDF and RapidOCR and issue a warning if it is empty. @@ -327,7 +323,7 @@ def _get_page_content( return content def _extract_metadata( - self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob + self, doc: fitz.Document, page: fitz.Page, blob: Blob ) -> dict: """Extract metadata from the document and page.""" return dict( @@ -344,9 +340,7 @@ def _extract_metadata( }, ) - def _extract_images_from_page( - self, doc: fitz.fitz.Document, page: fitz.fitz.Page - ) -> str: + def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str: """Extract images from page and get the text with RapidOCR.""" if not self.extract_images: return "" @@ -558,7 +552,7 @@ def __init__( textract_features: Optional[Sequence[int]] = None, client: Optional[Any] = None, *, - linearization_config: Optional["TextLinearizationConfig"] = None, + linearization_config: Optional[TextLinearizationConfig] = None, ) -> None: """Initializes the parser. diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 8e7d0152d3dc0..c8ee848a733a9 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -6,17 +6,17 @@ import time from abc import ABC from io import StringIO -from pathlib import Path +from pathlib import Path, PurePath from typing import ( TYPE_CHECKING, Any, - Dict, + BinaryIO, Iterator, - List, Mapping, Optional, Sequence, Union, + cast, ) from urllib.parse import urlparse @@ -68,7 +68,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf """ - def _get_elements(self) -> List: + def _get_elements(self) -> list: from unstructured.partition.pdf import partition_pdf return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] @@ -81,7 +81,9 @@ class BasePDFLoader(BaseLoader, ABC): clean up the temporary file after completion. 
""" - def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None): + def __init__( + self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None + ): """Initialize with a file path. Args: @@ -154,7 +156,7 @@ def source(self) -> str: class OnlinePDFLoader(BasePDFLoader): """Load online `PDF`.""" - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load documents.""" loader = UnstructuredPDFLoader(str(self.file_path)) return loader.load() @@ -223,13 +225,13 @@ class PyPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], password: Optional[Union[str, bytes]] = None, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, *, extraction_mode: str = "plain", - extraction_kwargs: Optional[Dict] = None, + extraction_kwargs: Optional[dict] = None, ) -> None: """Initialize with a file path.""" try: @@ -262,9 +264,9 @@ class PyPDFium2Loader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], *, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, ): """Initialize with a file path.""" @@ -290,7 +292,7 @@ class PyPDFDirectoryLoader(BaseLoader): def __init__( self, - path: Union[str, Path], + path: Union[str, PurePath], glob: str = "**/[!.]*.pdf", silent_errors: bool = False, load_hidden: bool = False, @@ -308,7 +310,7 @@ def __init__( def _is_visible(path: Path) -> bool: return not any(part.startswith(".") for part in path.parts) - def load(self) -> List[Document]: + def load(self) -> list[Document]: p = Path(self.path) docs = [] items = p.rglob(self.glob) if self.recursive else p.glob(self.glob) @@ -334,9 +336,9 @@ class PDFMinerLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], *, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, concatenate_pages: bool = True, ) -> None: @@ -374,7 +376,9 @@ def lazy_load( class PDFMinerPDFasHTMLLoader(BasePDFLoader): """Load `PDF` files as HTML content using `PDFMiner`.""" - def __init__(self, file_path: str, *, headers: Optional[Dict] = None): + def __init__( + self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None + ): """Initialize with a file path.""" try: from pdfminer.high_level import extract_text_to_fp # noqa:F401 @@ -395,14 +399,14 @@ def lazy_load(self) -> Iterator[Document]: output_string = StringIO() with open_filename(self.file_path, "rb") as fp: extract_text_to_fp( - fp, + cast(BinaryIO, fp), output_string, codec="", laparams=LAParams(), output_type="html", ) metadata = { - "source": self.file_path if self.web_path is None else self.web_path + "source": str(self.file_path) if self.web_path is None else self.web_path } yield Document(page_content=output_string.getvalue(), metadata=metadata) @@ -412,9 +416,9 @@ class PyMuPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], *, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, **kwargs: Any, ) -> None: @@ -447,7 +451,7 @@ def _lazy_load(self, **kwargs: Any) -> Iterator[Document]: blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from parser.lazy_parse(blob) - def load(self, **kwargs: Any) -> List[Document]: + def load(self, **kwargs: Any) -> list[Document]: return list(self._lazy_load(**kwargs)) def lazy_load(self) -> Iterator[Document]: @@ -461,11 +465,11 @@ 
class MathpixPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], processed_file_format: str = "md", max_wait_time_seconds: int = 500, should_clean_pdf: bool = False, - extra_request_data: Optional[Dict[str, Any]] = None, + extra_request_data: Optional[dict[str, Any]] = None, **kwargs: Any, ) -> None: """Initialize with a file path. @@ -499,7 +503,7 @@ def __init__( self.should_clean_pdf = should_clean_pdf @property - def _mathpix_headers(self) -> Dict[str, str]: + def _mathpix_headers(self) -> dict[str, str]: return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key} @property @@ -515,7 +519,7 @@ def data(self) -> dict: return {"options_json": json.dumps(options)} def send_pdf(self) -> str: - with open(self.file_path, "rb") as f: + with open(str(self.file_path), "rb") as f: files = {"file": f} response = requests.post( self.url, headers=self._mathpix_headers, files=files, data=self.data @@ -562,7 +566,7 @@ def wait_for_processing(self, pdf_id: str) -> None: # This indicates an error with the PDF processing raise ValueError("Unable to retrieve PDF from Mathpix") else: - print(f"Status: {status}, waiting for processing to complete") # noqa: T201 + logger.info("Status: %s, waiting for processing to complete", status) time.sleep(5) raise TimeoutError @@ -572,8 +576,7 @@ def get_processed_pdf(self, pdf_id: str) -> str: response = requests.get(url, headers=self._mathpix_headers) return response.content.decode("utf-8") - @staticmethod - def clean_pdf(contents: str) -> str: + def clean_pdf(self, contents: str) -> str: """Clean the PDF file. Args: @@ -596,7 +599,7 @@ def clean_pdf(contents: str) -> str: ) return contents - def load(self) -> List[Document]: + def load(self) -> list[Document]: pdf_id = self.send_pdf() contents = self.get_processed_pdf(pdf_id) if self.should_clean_pdf: @@ -610,10 +613,10 @@ class PDFPlumberLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, extract_images: bool = False, ) -> None: """Initialize with a file path.""" @@ -630,7 +633,7 @@ def __init__( self.dedupe = dedupe self.extract_images = extract_images - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load file.""" parser = PDFPlumberParser( @@ -669,13 +672,13 @@ class AmazonTextractPDFLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], textract_features: Optional[Sequence[str]] = None, client: Optional[Any] = None, credentials_profile_name: Optional[str] = None, region_name: Optional[str] = None, endpoint_url: Optional[str] = None, - headers: Optional[Dict] = None, + headers: Optional[dict] = None, *, linearization_config: Optional["TextLinearizationConfig"] = None, ) -> None: @@ -743,7 +746,7 @@ def __init__( linearization_config=linearization_config, ) - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load given path as pages.""" return list(self.lazy_load()) @@ -758,7 +761,7 @@ def lazy_load( if self.web_path and self._is_s3_url(self.web_path): blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc] else: - blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] + blob = Blob.from_path(self.file_path) if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1: raise ValueError( f"the file {blob.path} is a multi-page document, \ @@ -792,7 +795,9 @@ 
def _get_number_of_pages(blob: Blob) -> int: # type: ignore[valid-type] elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined] return 1 else: - raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined] + raise ValueError( # type: ignore[attr-defined] + f"unsupported mime type: {blob.mimetype}" + ) class DedocPDFLoader(DedocBaseLoader): @@ -887,7 +892,7 @@ def _make_config(self) -> dict: from dedoc.utils.langchain import make_manager_pdf_config return make_manager_pdf_config( - file_path=self.file_path, + file_path=str(self.file_path), parsing_params=self.parsing_parameters, split=self.split, ) @@ -898,10 +903,10 @@ class DocumentIntelligenceLoader(BasePDFLoader): def __init__( self, - file_path: str, + file_path: Union[str, PurePath], client: Any, model: str = "prebuilt-document", - headers: Optional[Dict] = None, + headers: Optional[dict] = None, ) -> None: """ Initialize the object for file processing with Azure Document Intelligence @@ -930,10 +935,10 @@ def __init__( ... ) """ - self.parser = DocumentIntelligenceParser(client=client, model=model) super().__init__(file_path, headers=headers) + self.parser = DocumentIntelligenceParser(client=client, model=model) - def load(self) -> List[Document]: + def load(self) -> list[Document]: """Load given path as pages.""" return list(self.lazy_load()) @@ -964,7 +969,7 @@ class ZeroxPDFLoader(BasePDFLoader): def __init__( self, - file_path: Union[str, Path], + file_path: Union[str, PurePath], model: str = "gpt-4o-mini", **zerox_kwargs: Any, ) -> None: @@ -1005,7 +1010,7 @@ def lazy_load(self) -> Iterator[Document]: # Directly call asyncio.run to execute zerox synchronously zerox_output = asyncio.run( - zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs) + zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs) ) # Convert zerox output to Document instances and yield them diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index 928c26898c9e7..47163c859520b 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -61,7 +61,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF) if splits_by_page: - assert metadata["page"] == 0 + assert int(metadata["page"]) == 0 def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None: diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 50c9fde29d918..66ce4ce3fa587 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import Sequence, Union @@ -17,7 +18,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None: """Test unstructured loader with various modes.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = UnstructuredPDFLoader(str(file_path), mode="elements") + loader = UnstructuredPDFLoader(file_path, mode="elements") docs = loader.load() assert len(docs) == 2 @@ -26,7 +27,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None: def 
test_unstructured_pdf_loader_paged_mode() -> None: """Test unstructured loader with various modes.""" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = UnstructuredPDFLoader(str(file_path), mode="paged") + loader = UnstructuredPDFLoader(file_path, mode="paged") docs = loader.load() assert len(docs) == 16 @@ -35,7 +36,7 @@ def test_unstructured_pdf_loader_paged_mode() -> None: def test_unstructured_pdf_loader_default_mode() -> None: """Test unstructured loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = UnstructuredPDFLoader(str(file_path)) + loader = UnstructuredPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 @@ -44,26 +45,26 @@ def test_unstructured_pdf_loader_default_mode() -> None: def test_pdfminer_loader() -> None: """Test PDFMiner loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerLoader(str(file_path)) + loader = PDFMinerLoader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerLoader(str(file_path)) + loader = PDFMinerLoader(file_path) docs = loader.load() assert len(docs) == 1 # Verify that concatenating pages parameter works file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerLoader(str(file_path), concatenate_pages=True) + loader = PDFMinerLoader(file_path, concatenate_pages=True) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerLoader(str(file_path), concatenate_pages=False) + loader = PDFMinerLoader(file_path, concatenate_pages=False) docs = loader.load() assert len(docs) == 16 @@ -72,13 +73,13 @@ def test_pdfminer_loader() -> None: def test_pdfminer_pdf_as_html_loader() -> None: """Test PDFMinerPDFasHTMLLoader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PDFMinerPDFasHTMLLoader(str(file_path)) + loader = PDFMinerPDFasHTMLLoader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PDFMinerPDFasHTMLLoader(str(file_path)) + loader = PDFMinerPDFasHTMLLoader(file_path) docs = loader.load() assert len(docs) == 1 @@ -87,13 +88,13 @@ def test_pdfminer_pdf_as_html_loader() -> None: def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyPDFium2Loader(str(file_path)) + loader = PyPDFium2Loader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFium2Loader(str(file_path)) + loader = PyPDFium2Loader(file_path) docs = loader.load() assert len(docs) == 16 @@ -102,13 +103,13 @@ def test_pypdfium2_loader() -> None: def test_pymupdf_loader() -> None: """Test PyMuPDF loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyMuPDFLoader(str(file_path)) + loader = PyMuPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyMuPDFLoader(str(file_path)) + loader = PyMuPDFLoader(file_path) docs = loader.load() assert len(docs) == 16 @@ -123,20 +124,21 @@ def test_pymupdf_loader() -> None: assert len(docs) == 1 +@pytest.mark.skipif( + not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found" +) def 
test_mathpix_loader() -> None: file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = MathpixPDFLoader(str(file_path)) + loader = MathpixPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 - print(docs[0].page_content) # noqa: T201 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = MathpixPDFLoader(str(file_path)) + loader = MathpixPDFLoader(file_path) docs = loader.load() assert len(docs) == 1 - print(docs[0].page_content) # noqa: T201 @pytest.mark.parametrize( @@ -187,8 +189,8 @@ def test_mathpix_loader() -> None: 1, False, ), - (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False), - (str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False), + (Path(__file__).parent.parent / "examples/hello.pdf", ["FORMS"], 1, False), + (Path(__file__).parent.parent / "examples/hello.pdf", [], 1, False), ( "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf", ["FORMS", "TABLES", "LAYOUT"], @@ -222,7 +224,7 @@ def test_amazontextract_loader( @pytest.mark.skip(reason="Requires AWS credentials to run") def test_amazontextract_loader_failures() -> None: # 2-page PDF local file system - two_page_pdf = str( + two_page_pdf = ( Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf" ) loader = AmazonTextractPDFLoader(two_page_pdf) diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index ddeaf734b0fe8..33c988d1b9294 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -43,6 +43,7 @@ "CassandraLoader", "CSVLoader", "ChatGPTLoader", + "CloudBlobLoader", "CoNLLULoader", "CollegeConfidentialLoader", "ConcurrentLoader",
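
Below is a minimal usage sketch of what this patch enables, assuming `pypdf` and `cloudpathlib` are installed; the local file path and the S3 URL are hypothetical placeholders, not part of this PR. `CloudBlobLoader` can now be imported directly from `langchain_community.document_loaders`, and the PDF loaders accept a `PurePath` (for example a `pathlib.Path`) as well as a `str` for `file_path`.

```python
from pathlib import Path

from langchain_community.document_loaders import CloudBlobLoader, PyPDFLoader

# file_path may now be a pathlib Path (any PurePath), not only a str.
loader = PyPDFLoader(Path("examples") / "hello.pdf")  # hypothetical local file
docs = loader.load()
print(len(docs), docs[0].metadata["source"])

# CloudBlobLoader is now re-exported at the document_loaders package level.
blob_loader = CloudBlobLoader("s3://my-bucket/pdfs/", glob="**/*.pdf")  # hypothetical bucket
for blob in blob_loader.yield_blobs():
    print(blob.path)
```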