Codebeaver/crewai 29 #34

Merged 4 commits on Feb 26, 2025
codebeaver.yml (7 additions, 0 deletions)
@@ -0,0 +1,7 @@
workspaces:
  - from: jest
    name: scrapegraph-js
    path: scrapegraph-js
  - from: pytest
    name: scrapegraph-py
    path: scrapegraph-py
scrapegraph-py/tests/test_localscraper.py (90 additions, 0 deletions)
@@ -0,0 +1,90 @@
import pytest
from pydantic import BaseModel
from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest

# Create a dummy output schema to test the conversion in model_dump.
class DummySchema(BaseModel):
    test_field: str

def test_output_schema_conversion():
    """
    Test that when an output_schema is provided in a LocalScraperRequest,
    model_dump returns a dictionary where the output_schema key holds the JSON schema
    of the provided Pydantic model.
    """
    user_prompt = "Extract company details"
    website_html = "<html><body><div>Content</div></body></html>"
    # Create a LocalScraperRequest with a dummy output_schema.
    request = LocalScraperRequest(
        user_prompt=user_prompt, website_html=website_html, output_schema=DummySchema
    )
    dumped = request.model_dump()
    # Verify that output_schema is converted properly in the dumped dictionary.
    assert "output_schema" in dumped
    assert dumped["output_schema"] == DummySchema.model_json_schema()

def test_invalid_website_html_structure():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html provided
    has no parseable HTML tags. This ensures the HTML content validation catches
    non-HTML input.
    """
    # This string has no HTML tags, so BeautifulSoup.find() should return None.
    invalid_html = "Just some random text"
    with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"):
        LocalScraperRequest(user_prompt="Extract info about the company", website_html=invalid_html)

def test_invalid_user_prompt_non_alnum():
    """
    Test that LocalScraperRequest raises a ValueError when the user_prompt
    does not contain any alphanumeric characters.
    """
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        LocalScraperRequest(
            user_prompt="!!!",
            website_html="<html><body><div>Valid Content</div></body></html>",
        )

def test_get_localscraper_request_invalid_uuid():
    """
    Test that GetLocalScraperRequest raises a ValueError when an invalid UUID is provided.
    This ensures that the model correctly validates the request_id as a proper UUID.
    """
    invalid_uuid = "not-a-valid-uuid"
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetLocalScraperRequest(request_id=invalid_uuid)

def test_website_html_exceeds_maximum_size():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html content
    exceeds the maximum allowed size of 2MB. The generated HTML is valid but too large.
    """
    # Calculate the number of characters needed to exceed 2MB when encoded in UTF-8.
    max_size_bytes = 2 * 1024 * 1024
    # Create a valid HTML string that exceeds 2MB by one byte.
    base_html_prefix = "<html><body>"
    base_html_suffix = "</body></html>"
    repeated_char_length = (
        max_size_bytes
        - len(base_html_prefix.encode("utf-8"))
        - len(base_html_suffix.encode("utf-8"))
        + 1
    )
    oversized_content = "a" * repeated_char_length
    oversized_html = f"{base_html_prefix}{oversized_content}{base_html_suffix}"

    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
        LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html)

def test_website_html_exactly_maximum_size():
    """
    Test that LocalScraperRequest accepts website_html content exactly 2MB in size.
    This ensures that the size validation correctly allows content on the boundary.
    """
    user_prompt = "Extract info with exact size HTML"
    prefix = "<html><body>"
    suffix = "</body></html>"
    # Calculate the content length needed to reach exactly 2MB when combined
    # with the prefix and suffix.
    max_size_bytes = 2 * 1024 * 1024
    content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
    valid_content = "a" * content_length
    html = prefix + valid_content + suffix

    # Creating the request should succeed at the boundary.
    request = LocalScraperRequest(user_prompt=user_prompt, website_html=html)

    # Verify that the HTML content is exactly 2MB in size when encoded in UTF-8.
    assert len(request.website_html.encode("utf-8")) == max_size_bytes
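The tests above pin down the validation contract without showing the model itself. For reference, here is a minimal sketch of what LocalScraperRequest is assumed to do, inferred only from the error messages and boundaries asserted above; it assumes Pydantic v2 field validators and BeautifulSoup for the HTML check, and the real model in scrapegraph_py.models.localscraper may differ in detail.

# Hedged sketch only: names and structure are inferred from the tests,
# not taken from the scrapegraph-py source.
from typing import Optional, Type

from bs4 import BeautifulSoup
from pydantic import BaseModel, field_validator

MAX_HTML_BYTES = 2 * 1024 * 1024  # the 2MB boundary exercised by the size tests

class LocalScraperRequestSketch(BaseModel):
    user_prompt: str
    website_html: str
    output_schema: Optional[Type[BaseModel]] = None

    @field_validator("user_prompt")
    @classmethod
    def check_user_prompt(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("User prompt cannot be empty")
        if not any(ch.isalnum() for ch in v):
            raise ValueError("User prompt must contain a valid prompt")
        return v

    @field_validator("website_html")
    @classmethod
    def check_website_html(cls, v: str) -> str:
        # The boundary tests imply a strict comparison: exactly 2MB passes,
        # one byte over fails.
        if len(v.encode("utf-8")) > MAX_HTML_BYTES:
            raise ValueError("Website HTML content exceeds maximum size of 2MB")
        # Plain text produces no tags, so find() returns None.
        if BeautifulSoup(v, "html.parser").find() is None:
            raise ValueError("Invalid HTML - no parseable content found")
        return v

    def model_dump(self, **kwargs):
        # test_output_schema_conversion implies the schema class is replaced
        # by its JSON schema in the dumped dict.
        data = super().model_dump(**kwargs)
        if self.output_schema is not None:
            data["output_schema"] = self.output_schema.model_json_schema()
        return data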
scrapegraph-py/tests/test_markdownify.py (59 additions, 0 deletions)
@@ -0,0 +1,59 @@
import pytest
from scrapegraph_py.models.markdownify import MarkdownifyRequest, GetMarkdownifyRequest

def test_markdownify_request_invalid_url_scheme():
    """
    Test that MarkdownifyRequest raises a ValueError when the website_url does not
    start with either 'http://' or 'https://'.
    """
    with pytest.raises(ValueError, match="Invalid URL"):
        MarkdownifyRequest(website_url="ftp://example.com")

def test_markdownify_request_empty_url():
    """
    Test that MarkdownifyRequest raises a ValueError when the website_url is empty
    or contains only whitespace.
    """
    with pytest.raises(ValueError, match="Website URL cannot be empty"):
        MarkdownifyRequest(website_url=" ")

def test_markdownify_request_valid_url():
    """
    Test that MarkdownifyRequest properly creates an instance when provided with a valid URL.
    This covers the scenario where the input URL meets all validation requirements.
    """
    valid_url = "https://example.com"
    req = MarkdownifyRequest(website_url=valid_url)
    assert req.website_url == valid_url

def test_markdownify_request_untrimmed_url():
    """
    Test that MarkdownifyRequest raises a ValueError when the website_url contains
    leading or trailing whitespace. Although the stripped URL would be valid, the
    value is not trimmed before validation, so the URL scheme check fails.
    """
    # The URL has leading whitespace, so it does not start directly with "https://"
    with pytest.raises(ValueError, match="Invalid URL"):
        MarkdownifyRequest(website_url=" https://example.com")

def test_get_markdownify_request_invalid_uuid():
    """
    Test that GetMarkdownifyRequest raises a ValueError when the request_id is not a valid UUID.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetMarkdownifyRequest(request_id="invalid_uuid")

def test_get_markdownify_request_valid_uuid():
    """
    Test that GetMarkdownifyRequest properly creates an instance when provided with a valid UUID.
    """
    valid_uuid = "123e4567-e89b-12d3-a456-426614174000"
    req = GetMarkdownifyRequest(request_id=valid_uuid)
    assert req.request_id == valid_uuid

def test_get_markdownify_request_untrimmed_uuid():
    """
    Test that GetMarkdownifyRequest raises a ValueError when the request_id
    contains leading or trailing whitespace, despite the trimmed UUID being valid.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetMarkdownifyRequest(request_id=" 123e4567-e89b-12d3-a456-426614174000 ")
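Similarly, a hedged sketch of the URL and UUID checks these tests assume; MarkdownifyRequestSketch and GetMarkdownifyRequestSketch are illustrative names, not the shipped classes.

# Sketch grounded in the asserted error messages, not the real source.
from uuid import UUID

from pydantic import BaseModel, field_validator

class MarkdownifyRequestSketch(BaseModel):
    website_url: str

    @field_validator("website_url")
    @classmethod
    def check_website_url(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("Website URL cannot be empty")
        # No trimming happens before this check, which is why a leading
        # space in an otherwise valid URL fails with "Invalid URL" above.
        if not v.startswith(("http://", "https://")):
            raise ValueError("Invalid URL")
        return v

class GetMarkdownifyRequestSketch(BaseModel):
    request_id: str

    @field_validator("request_id")
    @classmethod
    def check_request_id(cls, v: str) -> str:
        try:
            UUID(v)  # raises ValueError on malformed or untrimmed input
        except ValueError:
            raise ValueError("request_id must be a valid UUID")
        return v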
scrapegraph-py/tests/test_smartscraper.py (78 additions, 0 deletions)
@@ -0,0 +1,78 @@
import pytest
from pydantic import BaseModel, ValidationError
from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest

# Define a dummy schema to test the output_schema conversion in model_dump
class DummySchema(BaseModel):
    """A dummy schema to simulate a Pydantic model with JSON schema conversion."""
    a: int = 1

def test_model_dump_with_output_schema_conversion():
    """
    Test that model_dump on SmartScraperRequest converts the provided output_schema
    into a JSON schema dict.
    """
    # Create a request with a valid user prompt, website URL, and a dummy output_schema.
    request = SmartScraperRequest(
        user_prompt="Extract information about the company",
        website_url="https://scrapegraphai.com/",
        output_schema=DummySchema,
    )
    # Get the dump dict from the model.
    output = request.model_dump()
    # model_dump should include 'output_schema' converted to its JSON schema representation.
    expected_schema = DummySchema.model_json_schema()
    assert output.get("output_schema") == expected_schema

def test_model_dump_without_output_schema():
    """
    Test that model_dump on SmartScraperRequest returns output_schema as None
    when no output_schema is provided. This ensures that the conversion logic is only
    applied when output_schema is not None.
    """
    # Create a valid SmartScraperRequest without providing an output_schema.
    request = SmartScraperRequest(
        user_prompt="Extract some meaningful data",
        website_url="https://scrapegraphai.com/",
    )
    # Get the dumped dictionary from the model.
    output = request.model_dump()
    # Ensure that the output contains the key "output_schema" and its value is None.
    assert "output_schema" in output, "Output schema key should be present even if None"
    assert output["output_schema"] is None, "Output schema should be None when not provided"

def test_invalid_get_smartscraper_request_id():
    """
    Test that GetSmartScraperRequest raises a ValueError when provided with an invalid UUID.
    This test ensures that the request_id field is validated correctly.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetSmartScraperRequest(request_id="invalid-uuid")

def test_invalid_url_in_smartscraper_request():
    """
    Test that SmartScraperRequest raises a ValueError when provided with a website_url
    that does not start with 'http://' or 'https://'. This ensures the URL validation works.
    """
    with pytest.raises(ValueError, match="Invalid URL"):
        SmartScraperRequest(
            user_prompt="Extract data",
            website_url="ftp://invalid-url",
        )

def test_invalid_user_prompt_empty_and_non_alnum():
    """
    Test that SmartScraperRequest raises a ValueError when the user_prompt is empty
    (or only whitespace), or when it contains no alphanumeric characters. This ensures
    the user prompt validator is working correctly.
    """
    # Test with a user_prompt that is only whitespace
    with pytest.raises(ValueError, match="User prompt cannot be empty"):
        SmartScraperRequest(
            user_prompt=" ",
            website_url="https://scrapegraphai.com/",
        )
    # Test with a user_prompt that contains no alphanumeric characters
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        SmartScraperRequest(
            user_prompt="!!!",
            website_url="https://scrapegraphai.com/",
        )
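For reference, a quick usage sketch of the request/dump flow these tests cover; CompanySchema is a hypothetical output schema used only for illustration.

from pydantic import BaseModel

from scrapegraph_py.models.smartscraper import SmartScraperRequest

class CompanySchema(BaseModel):  # hypothetical schema, not part of the SDK
    name: str
    description: str

request = SmartScraperRequest(
    user_prompt="Extract information about the company",
    website_url="https://scrapegraphai.com/",
    output_schema=CompanySchema,
)

payload = request.model_dump()
# Per test_model_dump_with_output_schema_conversion, payload["output_schema"]
# now holds CompanySchema.model_json_schema() rather than the class itself.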