A Python library for easily interacting with the Skrape.ai API. Define your scraping schema using Pydantic and get type-safe results.
- 🛡️ Type-safe: Define your schemas using Pydantic and get fully typed results
- 🚀 Simple API: Just define a schema and get your data
- 🔄 Async Support: Built with async/await for efficient scraping
- 🧩 Minimal Dependencies: Built on top of proven libraries like Pydantic and httpx
- 📝 Markdown Conversion: Convert any webpage to clean markdown
- 🕷️ Web Crawling: Crawl multiple pages with browser automation
- 🔄 Background Jobs: Handle long-running tasks asynchronously
```bash
pip install skrape-py
```
Or with Poetry:
```bash
poetry add skrape-py
```
Set up your API key in a `.env` file:
```
SKRAPE_API_KEY="your_api_key_here"
```
Get your API key at Skrape.ai.
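Note that `os.getenv` (used in the examples below) does not read `.env` files on its own. A minimal sketch for loading the file, assuming the separate `python-dotenv` package is installed:

```python
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # loads variables from .env into the environment
api_key = os.getenv("SKRAPE_API_KEY")
```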
```python
from skrape import Skrape
from pydantic import BaseModel
import os
import asyncio

# Define your schema using Pydantic
class ProductSchema(BaseModel):
    title: str
    price: float
    description: str
    rating: float

async def main():
    async with Skrape(api_key=os.getenv("SKRAPE_API_KEY")) as skrape:
        # Start extraction job
        job = await skrape.extract(
            "https://example.com/product",
            ProductSchema,
            {"renderJs": True},  # Enable JavaScript rendering if needed
        )

        # Wait for the job to complete and get results
        while job.status != "COMPLETED":
            job = await skrape.get_job(job.jobId)
            await asyncio.sleep(1)

        # Access the extracted data
        product = job.result
        print(f"Product: {product.title}")
        print(f"Price: ${product.price}")

asyncio.run(main())
```
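The polling loop above reappears in every background-job example below, so it can be worth factoring out. This sketch uses only the `get_job` call shown in this README; the `FAILED` status check is an assumption about the API's status values:

```python
import asyncio

async def wait_for_job(skrape, job, interval: float = 1.0):
    """Poll a background job until it finishes, then return the final job."""
    while job.status != "COMPLETED":
        if job.status == "FAILED":  # assumed terminal status; adjust as needed
            raise RuntimeError(f"Job {job.jobId} failed")
        await asyncio.sleep(interval)
        job = await skrape.get_job(job.jobId)
    return job
```

With this helper, each polling loop below collapses to `job = await wait_for_job(skrape, job)`.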
Convert any webpage to clean markdown. These snippets run inside an `async with Skrape(...)` block, as in the quick start:

```python
# Single URL
response = await skrape.to_markdown(
    "https://example.com/article",
    {"renderJs": True},
)
print(response.result)  # Clean markdown content

# Multiple URLs (async background job)
job = await skrape.to_markdown_bulk(
    ["https://example.com/1", "https://example.com/2"],
    {"renderJs": True},
)

# Get results when ready
while job.status != "COMPLETED":
    job = await skrape.get_job(job.jobId)
    await asyncio.sleep(1)

for markdown in job.result:
    print(markdown)
```
Crawl multiple pages with browser automation:

```python
# Start crawling job
job = await skrape.crawl(
    ["https://example.com", "https://example.com/page2"],
    {
        "renderJs": True,
        "actions": [
            {"scroll": {"distance": 500}},  # Scroll down 500px
            {"wait_for": ".content"},       # Wait for content to load
        ],
    },
)

# Get results when ready
while job.status != "COMPLETED":
    job = await skrape.get_job(job.jobId)
    await asyncio.sleep(1)

for page in job.result:
    print(page)
```
Common options for all endpoints:
```python
options = {
    "renderJs": True,  # Enable JavaScript rendering
    "actions": [
        {"click": {"selector": ".button"}},  # Click an element
        {"scroll": {"distance": 500}},       # Scroll the page
        {"wait_for": ".content"},            # Wait for an element
        {"type": {                           # Type into an input
            "selector": "input",
            "text": "search term",
        }},
    ],
    "callbackUrl": "https://your-server.com/webhook",  # For async jobs
}
```
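For example, actions can be chained to drive a page before extraction. The selectors below are placeholders for illustration; this sketch reuses the `extract` call and `ProductSchema` from the quick start:

```python
# Hypothetical search flow: type a query, submit it, wait for results,
# then extract structured data from the rendered page.
search_options = {
    "renderJs": True,
    "actions": [
        {"type": {"selector": "input[name='q']", "text": "laptop"}},
        {"click": {"selector": "button[type='submit']"}},
        {"wait_for": ".results"},
    ],
}
job = await skrape.extract("https://example.com/search", ProductSchema, search_options)
```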
The library provides typed exceptions for better error handling:
```python
import os
from skrape import Skrape, SkrapeValidationError, SkrapeAPIError

async with Skrape(api_key=os.getenv("SKRAPE_API_KEY")) as skrape:
    try:
        # url and schema defined as in the quick start
        response = await skrape.extract(url, schema)
    except SkrapeValidationError as e:
        print(f"Data doesn't match schema: {e}")
    except SkrapeAPIError as e:
        print(f"API error: {e}")
```
The API response includes rate limit information that you can use to manage your requests:
```python
response = await skrape.to_markdown(url)
usage = response.usage

print(f"Remaining credits: {usage.remaining}")
print("Rate limit info:")
print(f"  - Remaining: {usage.rateLimit.remaining}")
print(f"  - Base limit: {usage.rateLimit.baseLimit}")
print(f"  - Burst limit: {usage.rateLimit.burstLimit}")
print(f"  - Reset at: {usage.rateLimit.reset}")
```