Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions deploy/docker/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,8 @@ async def handle_markdown_request(
config: Optional[dict] = None,
provider: Optional[str] = None,
temperature: Optional[float] = None,
base_url: Optional[str] = None
base_url: Optional[str] = None,
crawler_config: Optional[dict] = None,
) -> str:
"""Handle markdown generation requests."""
crawler = None
Expand Down Expand Up @@ -313,14 +314,13 @@ async def handle_markdown_request(
**_cfg["crawler"]["browser"].get("kwargs", {}),
)
crawler = await get_crawler(browser_cfg)
result = await crawler.arun(
url=decoded_url,
config=CrawlerRunConfig(
markdown_generator=md_generator,
scraping_strategy=LXMLWebScrapingStrategy(),
cache_mode=cache_mode
)
)
cc = crawler_config or {}
cfg = CrawlerRunConfig.load(cc)
cfg.markdown_generator = md_generator
cfg.scraping_strategy = LXMLWebScrapingStrategy()
if 'cache_mode' not in cc:
cfg.cache_mode = cache_mode
result = await crawler.arun(url=decoded_url, config=cfg)

if not result.success:
raise HTTPException(
Expand Down
52 changes: 47 additions & 5 deletions deploy/docker/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,25 @@

class CrawlRequest(BaseModel):
urls: List[str] = Field(min_length=1, max_length=100)
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
browser_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional BrowserConfig overrides (e.g. headless, user_agent, proxy, viewport)"
)
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description=(
"Optional CrawlerRunConfig overrides. Key parameters: "
"wait_until ('load', 'domcontentloaded', 'networkidle', 'commit') — when to consider navigation done; "
"delay_before_return_html (float, seconds) — extra wait before capturing HTML, useful for SPAs; "
"cache_mode ('enabled', 'disabled', 'read_only', 'write_only', 'bypass') — cache behaviour; "
"js_code (str | list) — JavaScript to execute after page load; "
"wait_for (str) — CSS selector or JS expression to wait for before returning; "
"screenshot (bool) — capture a screenshot; pdf (bool) — generate a PDF; "
"extraction_strategy (dict) — structured extraction config; "
"markdown_generator (dict) — markdown generation config. "
"All CrawlerRunConfig fields are accepted; unknown keys are silently ignored."
)
)
crawler_configs: Optional[List[Dict]] = Field(
default=None,
description=(
Expand Down Expand Up @@ -70,23 +87,43 @@ class MarkdownRequest(BaseModel):
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
base_url: Optional[str] = Field(None, description="LLM API base URL override")
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
"Takes precedence over the 'c' cache parameter when cache_mode is specified here. "
"scraping_strategy is always set to LXMLWebScrapingStrategy by this endpoint and cannot be overridden."
)


class RawCode(BaseModel):
    # Request/response model wrapping a single required string of code.
    # (A `#` comment is used instead of a docstring so the generated
    # schema description is left unchanged.)
    code: str

class HTMLRequest(BaseModel):
    # Request body for the HTML endpoint.
    #
    # url            -- target to crawl (required).
    # crawler_config -- optional dict of CrawlerRunConfig overrides; defaults
    #                   to a fresh empty dict per request. The type also
    #                   admits an explicit None.
    url: str
    crawler_config: Optional[Dict] = Field(
        description=(
            "Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode)"
        ),
        default_factory=dict,
    )

class ScreenshotRequest(BaseModel):
    # Request body for the screenshot endpoint.
    #
    # url                 -- target to capture (required).
    # screenshot_wait_for -- optional float override; None means "not
    #                        supplied", so a value given via crawler_config
    #                        is not overwritten.
    # wait_for_images     -- optional bool override; None has the same
    #                        "not supplied" meaning.
    # output_path         -- optional path string (its use is not shown here;
    #                        presumably where the capture is saved -- confirm
    #                        against the handler).
    # crawler_config      -- optional dict of CrawlerRunConfig overrides;
    #                        defaults to a fresh empty dict per request.
    #
    # NOTE: each override field is declared exactly once. The earlier
    # duplicate declarations (defaults 2 / False) were immediately shadowed
    # by the None defaults below and have been dropped; the effective
    # defaults are unchanged.
    url: str
    screenshot_wait_for: Optional[float] = None
    wait_for_images: Optional[bool] = None
    output_path: Optional[str] = None
    crawler_config: Optional[Dict] = Field(
        default_factory=dict,
        description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
                    "screenshot=True is always enforced."
    )

class PDFRequest(BaseModel):
    # Request body for the PDF endpoint.
    #
    # url            -- target to render (required).
    # output_path    -- optional path string (its use is not shown here;
    #                   presumably where the PDF is saved -- confirm against
    #                   the handler).
    # crawler_config -- optional dict of CrawlerRunConfig overrides; defaults
    #                   to a fresh empty dict per request.
    url: str
    output_path: Optional[str] = None
    crawler_config: Optional[Dict] = Field(
        description=(
            "Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
            "pdf=True is always enforced."
        ),
        default_factory=dict,
    )


class JSEndpointRequest(BaseModel):
Expand All @@ -95,6 +132,11 @@ class JSEndpointRequest(BaseModel):
...,
description="List of separated JavaScript snippets to execute"
)
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
"js_code is always set from the scripts parameter and cannot be overridden via crawler_config."
)


class WebhookConfig(BaseModel):
Expand Down
18 changes: 13 additions & 5 deletions deploy/docker/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,8 @@ async def get_markdown(
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config, body.provider,
body.temperature, body.base_url
body.temperature, body.base_url,
crawler_config=body.crawler_config
)
return JSONResponse({
"url": body.url,
Expand All @@ -459,9 +460,9 @@ async def generate_html(
Use when you need sanitized HTML structures for building schemas or further processing.
"""
validate_url_scheme(body.url, allow_raw=True)
cfg = CrawlerRunConfig()
crawler = None
try:
cfg = CrawlerRunConfig.load(body.crawler_config or {})
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
Expand Down Expand Up @@ -496,7 +497,12 @@ async def generate_screenshot(
validate_url_scheme(body.url)
crawler = None
try:
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for, wait_for_images=body.wait_for_images)
cfg = CrawlerRunConfig.load(body.crawler_config or {})
cfg.screenshot = True
if body.screenshot_wait_for is not None:
cfg.screenshot_wait_for = body.screenshot_wait_for
if body.wait_for_images is not None:
cfg.wait_for_images = body.wait_for_images
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
Expand Down Expand Up @@ -534,7 +540,8 @@ async def generate_pdf(
validate_url_scheme(body.url)
crawler = None
try:
cfg = CrawlerRunConfig(pdf=True)
cfg = CrawlerRunConfig.load(body.crawler_config or {})
cfg.pdf = True
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
Expand Down Expand Up @@ -610,7 +617,8 @@ class MarkdownGenerationResult(BaseModel):
validate_url_scheme(body.url)
crawler = None
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
cfg = CrawlerRunConfig.load(body.crawler_config or {})
cfg.js_code = body.scripts
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
Expand Down
Loading