Codebeaver/crewai 29 #34

Merged 4 commits on Feb 26, 2025
codebeaver.yml (7 additions, 0 deletions)
@@ -0,0 +1,7 @@
workspaces:
  - from: jest
    name: scrapegraph-js
    path: scrapegraph-js
  - from: pytest
    name: scrapegraph-py
    path: scrapegraph-py
scrapegraph-py/tests/test_localscraper.py (90 additions, 0 deletions)
@@ -0,0 +1,90 @@
import pytest
from pydantic import BaseModel
from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest

# Create a dummy output schema to test the conversion in model_dump.
class DummySchema(BaseModel):
    test_field: str

def test_output_schema_conversion():
    """
    Test that when an output_schema is provided in a LocalScraperRequest,
    model_dump returns a dictionary where the output_schema key holds the JSON schema
    of the provided Pydantic model.
    """
    user_prompt = "Extract company details"
    website_html = "<html><body><div>Content</div></body></html>"
    # Create a LocalScraperRequest with a dummy output_schema.
    request = LocalScraperRequest(
        user_prompt=user_prompt, website_html=website_html, output_schema=DummySchema
    )
    dumped = request.model_dump()
    # Verify that output_schema is converted properly in the dumped dictionary.
    assert "output_schema" in dumped
    assert dumped["output_schema"] == DummySchema.model_json_schema()

def test_invalid_website_html_structure():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html provided
    has no parseable HTML tags. This ensures the HTML content validation catches
    non-HTML input.
    """
    # This string has no HTML tags, so BeautifulSoup.find() should return None.
    invalid_html = "Just some random text"
    with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"):
        LocalScraperRequest(user_prompt="Extract info about the company", website_html=invalid_html)

def test_invalid_user_prompt_non_alnum():
    """
    Test that LocalScraperRequest raises a ValueError when the user_prompt
    does not contain any alphanumeric characters.
    """
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        LocalScraperRequest(
            user_prompt="!!!",
            website_html="<html><body><div>Valid Content</div></body></html>",
        )

def test_get_localscraper_request_invalid_uuid():
    """
    Test that GetLocalScraperRequest raises a ValueError when an invalid UUID is provided.
    This ensures that the model correctly validates the request_id as a proper UUID.
    """
    invalid_uuid = "not-a-valid-uuid"
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetLocalScraperRequest(request_id=invalid_uuid)

def test_website_html_exceeds_maximum_size():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html content
    exceeds the maximum allowed size of 2MB. The generated HTML is valid but too large.
    """
    # Calculate the number of characters needed to exceed 2MB when encoded in UTF-8.
    max_size_bytes = 2 * 1024 * 1024
    # Create a valid HTML string that exceeds 2MB by one byte.
    base_html_prefix = "<html><body>"
    base_html_suffix = "</body></html>"
    repeated_char_length = (
        max_size_bytes
        - len(base_html_prefix.encode("utf-8"))
        - len(base_html_suffix.encode("utf-8"))
        + 1
    )
    oversized_content = "a" * repeated_char_length
    oversized_html = f"{base_html_prefix}{oversized_content}{base_html_suffix}"

    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
        LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html)

def test_website_html_exactly_maximum_size():
    """
    Test that LocalScraperRequest accepts website_html content exactly 2MB in size.
    This ensures that the size validation correctly allows content on the boundary.
    """
    user_prompt = "Extract info with exact size HTML"
    prefix = "<html><body>"
    suffix = "</body></html>"
    # Calculate the content length needed to reach exactly 2MB when combined
    # with the prefix and suffix.
    max_size_bytes = 2 * 1024 * 1024
    content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
    valid_content = "a" * content_length
    html = prefix + valid_content + suffix

    # Creating the request should succeed at the boundary.
    request = LocalScraperRequest(user_prompt=user_prompt, website_html=html)

    # Verify that the HTML content is exactly 2MB in size when encoded in UTF-8.
    assert len(request.website_html.encode("utf-8")) == max_size_bytes
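The tests above pin down the validation contract without showing the model itself. For reference, here is a minimal sketch of what LocalScraperRequest is assumed to do, inferred only from the error messages and boundaries asserted above; it assumes Pydantic v2 field validators and BeautifulSoup for the HTML check, and the real model in scrapegraph_py.models.localscraper may differ in detail.

# Hedged sketch only: names and structure are inferred from the tests,
# not taken from the scrapegraph-py source.
from typing import Optional, Type

from bs4 import BeautifulSoup
from pydantic import BaseModel, field_validator

MAX_HTML_BYTES = 2 * 1024 * 1024  # the 2MB boundary exercised by the size tests

class LocalScraperRequestSketch(BaseModel):
    user_prompt: str
    website_html: str
    output_schema: Optional[Type[BaseModel]] = None

    @field_validator("user_prompt")
    @classmethod
    def check_user_prompt(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("User prompt cannot be empty")
        if not any(ch.isalnum() for ch in v):
            raise ValueError("User prompt must contain a valid prompt")
        return v

    @field_validator("website_html")
    @classmethod
    def check_website_html(cls, v: str) -> str:
        # The boundary tests imply a strict comparison: exactly 2MB passes,
        # one byte over fails.
        if len(v.encode("utf-8")) > MAX_HTML_BYTES:
            raise ValueError("Website HTML content exceeds maximum size of 2MB")
        # Plain text produces no tags, so find() returns None.
        if BeautifulSoup(v, "html.parser").find() is None:
            raise ValueError("Invalid HTML - no parseable content found")
        return v

    def model_dump(self, **kwargs):
        # test_output_schema_conversion implies the schema class is replaced
        # by its JSON schema in the dumped dict.
        data = super().model_dump(**kwargs)
        if self.output_schema is not None:
            data["output_schema"] = self.output_schema.model_json_schema()
        return data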
scrapegraph-py/tests/test_markdownify.py (59 additions, 0 deletions)
@@ -0,0 +1,59 @@
import pytest
from scrapegraph_py.models.markdownify import MarkdownifyRequest, GetMarkdownifyRequest

def test_markdownify_request_invalid_url_scheme():
    """
    Test that MarkdownifyRequest raises a ValueError when the website_url does not
    start with either 'http://' or 'https://'.
    """
    with pytest.raises(ValueError, match="Invalid URL"):
        MarkdownifyRequest(website_url="ftp://example.com")

def test_markdownify_request_empty_url():
    """
    Test that MarkdownifyRequest raises a ValueError when the website_url is empty
    or contains only whitespace.
    """
    with pytest.raises(ValueError, match="Website URL cannot be empty"):
        MarkdownifyRequest(website_url=" ")

def test_markdownify_request_valid_url():
    """
    Test that MarkdownifyRequest properly creates an instance when provided with a valid URL.
    This covers the scenario where the input URL meets all validation requirements.
    """
    valid_url = "https://example.com"
    req = MarkdownifyRequest(website_url=valid_url)
    assert req.website_url == valid_url

def test_markdownify_request_untrimmed_url():
    """
    Test that MarkdownifyRequest raises a ValueError when the website_url contains
    leading or trailing whitespace. Although the stripped URL would be valid, the
    value is not trimmed before validation, so the URL scheme check fails.
    """
    # The URL has leading whitespace, so it does not start directly with "https://"
    with pytest.raises(ValueError, match="Invalid URL"):
        MarkdownifyRequest(website_url=" https://example.com")

def test_get_markdownify_request_invalid_uuid():
    """
    Test that GetMarkdownifyRequest raises a ValueError when the request_id is not a valid UUID.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetMarkdownifyRequest(request_id="invalid_uuid")

def test_get_markdownify_request_valid_uuid():
    """
    Test that GetMarkdownifyRequest properly creates an instance when provided with a valid UUID.
    """
    valid_uuid = "123e4567-e89b-12d3-a456-426614174000"
    req = GetMarkdownifyRequest(request_id=valid_uuid)
    assert req.request_id == valid_uuid

def test_get_markdownify_request_untrimmed_uuid():
    """
    Test that GetMarkdownifyRequest raises a ValueError when the request_id
    contains leading or trailing whitespace, despite the trimmed UUID being valid.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetMarkdownifyRequest(request_id=" 123e4567-e89b-12d3-a456-426614174000 ")
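Similarly, a hedged sketch of the URL and UUID checks these tests assume; MarkdownifyRequestSketch and GetMarkdownifyRequestSketch are illustrative names, not the shipped classes.

# Sketch grounded in the asserted error messages, not the real source.
from uuid import UUID

from pydantic import BaseModel, field_validator

class MarkdownifyRequestSketch(BaseModel):
    website_url: str

    @field_validator("website_url")
    @classmethod
    def check_website_url(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("Website URL cannot be empty")
        # No trimming happens before this check, which is why a leading
        # space in an otherwise valid URL fails with "Invalid URL" above.
        if not v.startswith(("http://", "https://")):
            raise ValueError("Invalid URL")
        return v

class GetMarkdownifyRequestSketch(BaseModel):
    request_id: str

    @field_validator("request_id")
    @classmethod
    def check_request_id(cls, v: str) -> str:
        try:
            UUID(v)  # raises ValueError on malformed or untrimmed input
        except ValueError:
            raise ValueError("request_id must be a valid UUID")
        return v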
scrapegraph-py/tests/test_smartscraper.py (78 additions, 0 deletions)
@@ -0,0 +1,78 @@
import pytest
from pydantic import BaseModel, ValidationError
from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest

# Define a dummy schema to test the output_schema conversion in model_dump
class DummySchema(BaseModel):
    """A dummy schema to simulate a Pydantic model with JSON schema conversion."""
    a: int = 1

def test_model_dump_with_output_schema_conversion():
    """
    Test that model_dump on SmartScraperRequest converts the provided output_schema
    into a JSON schema dict.
    """
    # Create a request with a valid user prompt, website URL, and a dummy output_schema.
    request = SmartScraperRequest(
        user_prompt="Extract information about the company",
        website_url="https://scrapegraphai.com/",
        output_schema=DummySchema,
    )
    # Get the dump dict from the model.
    output = request.model_dump()
    # model_dump should include 'output_schema' converted to its JSON schema representation.
    expected_schema = DummySchema.model_json_schema()
    assert output.get("output_schema") == expected_schema

def test_model_dump_without_output_schema():
    """
    Test that model_dump on SmartScraperRequest returns output_schema as None
    when no output_schema is provided. This ensures that the conversion logic is only
    applied when output_schema is not None.
    """
    # Create a valid SmartScraperRequest without providing an output_schema.
    request = SmartScraperRequest(
        user_prompt="Extract some meaningful data",
        website_url="https://scrapegraphai.com/",
    )
    # Get the dumped dictionary from the model.
    output = request.model_dump()
    # Ensure that the output contains the key "output_schema" and its value is None.
    assert "output_schema" in output, "Output schema key should be present even if None"
    assert output["output_schema"] is None, "Output schema should be None when not provided"

def test_invalid_get_smartscraper_request_id():
    """
    Test that GetSmartScraperRequest raises a ValueError when provided with an invalid UUID.
    This test ensures that the request_id field is validated correctly.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetSmartScraperRequest(request_id="invalid-uuid")

def test_invalid_url_in_smartscraper_request():
    """
    Test that SmartScraperRequest raises a ValueError when provided with a website_url
    that does not start with 'http://' or 'https://'. This ensures the URL validation works.
    """
    with pytest.raises(ValueError, match="Invalid URL"):
        SmartScraperRequest(
            user_prompt="Extract data",
            website_url="ftp://invalid-url",
        )

def test_invalid_user_prompt_empty_and_non_alnum():
    """
    Test that SmartScraperRequest raises a ValueError when the user_prompt is empty
    (or only whitespace), or when it contains no alphanumeric characters. This ensures
    the user prompt validator is working correctly.
    """
    # Test with a user_prompt that is only whitespace
    with pytest.raises(ValueError, match="User prompt cannot be empty"):
        SmartScraperRequest(
            user_prompt=" ",
            website_url="https://scrapegraphai.com/",
        )
    # Test with a user_prompt that contains no alphanumeric characters
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        SmartScraperRequest(
            user_prompt="!!!",
            website_url="https://scrapegraphai.com/",
        )
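For reference, a quick usage sketch of the request/dump flow these tests cover; CompanySchema is a hypothetical output schema used only for illustration.

from pydantic import BaseModel

from scrapegraph_py.models.smartscraper import SmartScraperRequest

class CompanySchema(BaseModel):  # hypothetical schema, not part of the SDK
    name: str
    description: str

request = SmartScraperRequest(
    user_prompt="Extract information about the company",
    website_url="https://scrapegraphai.com/",
    output_schema=CompanySchema,
)

payload = request.model_dump()
# Per test_model_dump_with_output_schema_conversion, payload["output_schema"]
# now holds CompanySchema.model_json_schema() rather than the class itself.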