
Commit 7549ec4

feat: add infinite scrolling
1 parent 6c2806d

File tree

6 files changed: +132 -2 lines changed
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
import asyncio
from typing import List
from pydantic import BaseModel

from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

sgai_logger.set_logging(level="INFO")

# Define the output schema
class Company(BaseModel):
    name: str
    category: str
    location: str

class CompaniesResponse(BaseModel):
    companies: List[Company]

async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
    """Scrape companies from a specific YC batch with infinite scroll."""
    try:
        response = await client.smartscraper(
            website_url=f"{url}?batch={batch}",
            user_prompt="Extract all company names and their categories from the page",
            output_schema=CompaniesResponse,
            number_of_scrolls=10  # Scroll 10 times to load more companies
        )

        # Parse and print the results
        result = CompaniesResponse.model_validate(response['result'])
        print(f"\nCompanies from {batch} batch:")
        print("=" * 80)
        for company in result.companies:
            print(f"Name: {company.name}")
            print(f"Category: {company.category}")
            print(f"Location: {company.location}")
            print("-" * 80)

    except Exception as e:
        print(f"Error scraping {batch} batch: {e}")

async def main():
    # Initialize async client
    sgai_client = AsyncClient(api_key="your-api-key-here")

    try:
        # Define batches to scrape
        base_url = "https://www.ycombinator.com/companies"
        batches = [
            "Spring%202025",
            "Winter%202025",
            "Summer%202024"
        ]

        # Create tasks for each batch
        tasks = [
            scrape_companies(sgai_client, base_url, batch)
            for batch in batches
        ]

        # Execute all scraping tasks concurrently
        await asyncio.gather(*tasks)

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        await sgai_client.close()

if __name__ == "__main__":
    asyncio.run(main())
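
A note on the pattern above: asyncio.gather(*tasks) runs the three batch scrapes concurrently, and because scrape_companies catches its own exceptions, one failing batch cannot cancel the others. The same isolation can also be had without a per-task try/except via return_exceptions=True; a self-contained sketch with stub coroutines standing in for real scrapes (not part of this commit):

import asyncio

async def fetch_batch(batch: str) -> str:
    # Stub standing in for scrape_companies; one batch raises to
    # demonstrate per-task failure reporting.
    if batch == "Winter%202025":
        raise RuntimeError("simulated scrape failure")
    await asyncio.sleep(0.1)
    return f"{batch}: ok"

async def main():
    batches = ["Spring%202025", "Winter%202025", "Summer%202024"]
    # return_exceptions=True keeps one failing batch from cancelling the
    # rest; failures come back as exception objects in the results list.
    results = await asyncio.gather(
        *(fetch_batch(b) for b in batches), return_exceptions=True
    )
    for batch, outcome in zip(batches, results):
        print(batch, "->", outcome)

if __name__ == "__main__":
    asyncio.run(main())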
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
from pydantic import BaseModel
from typing import List

sgai_logger.set_logging(level="INFO")

# Define the output schema
class Company(BaseModel):
    name: str
    category: str
    location: str

class CompaniesResponse(BaseModel):
    companies: List[Company]

# Initialize the client with explicit API key
sgai_client = Client(api_key="sgai-4cf4a4f5-87f7-457a-8c58-0790ecaf323e")

try:
    # SmartScraper request with infinite scroll
    response = sgai_client.smartscraper(
        website_url="https://www.ycombinator.com/companies?batch=Spring%202025",
        user_prompt="Extract all company names and their categories from the page",
        output_schema=CompaniesResponse,
        number_of_scrolls=10  # Scroll 10 times to load more companies
    )

    # Print the response
    print(f"Request ID: {response['request_id']}")

    # Parse and print the results in a structured way
    result = CompaniesResponse.model_validate(response['result'])
    print("\nExtracted Companies:")
    print("-" * 80)
    for company in result.companies:
        print(f"Name: {company.name}")
        print(f"Category: {company.category}")
        print(f"Location: {company.location}")
        print("-" * 80)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    sgai_client.close()

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 4 additions & 0 deletions
@@ -174,6 +174,7 @@ async def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -183,6 +184,8 @@ async def smartscraper(
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
@@ -191,6 +194,7 @@ async def smartscraper(
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
         logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 4 additions & 0 deletions
@@ -182,6 +182,7 @@ def smartscraper(
         website_html: Optional[str] = None,
         headers: Optional[dict[str, str]] = None,
         output_schema: Optional[BaseModel] = None,
+        number_of_scrolls: Optional[int] = None,
     ):
         """Send a smartscraper request"""
         logger.info("🔍 Starting smartscraper request")
@@ -191,6 +192,8 @@ def smartscraper(
             logger.debug("📄 Using provided HTML content")
         if headers:
             logger.debug("🔧 Using custom headers")
+        if number_of_scrolls is not None:
+            logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
@@ -199,6 +202,7 @@ def smartscraper(
             headers=headers,
             user_prompt=user_prompt,
             output_schema=output_schema,
+            number_of_scrolls=number_of_scrolls,
         )
         logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/models/smartscraper.py

Lines changed: 6 additions & 1 deletion
@@ -4,7 +4,7 @@
 from uuid import UUID

 from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, model_validator, conint


 class SmartScraperRequest(BaseModel):
@@ -29,6 +29,11 @@ class SmartScraperRequest(BaseModel):
         description="Optional headers to send with the request, including cookies and user agent",
     )
     output_schema: Optional[Type[BaseModel]] = None
+    number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(
+        default=None,
+        description="Number of times to scroll the page (0-100). If None, no scrolling will be performed.",
+        example=10
+    )

     @model_validator(mode="after")
     def validate_user_prompt(self) -> "SmartScraperRequest":
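
Because number_of_scrolls is declared as conint(ge=0, le=100), Pydantic rejects out-of-range scroll counts when the request model is built, before any network call. A minimal sketch of that behavior, using a hypothetical standalone model with the same constraint (SmartScraperRequest itself has other required fields):

from typing import Optional

from pydantic import BaseModel, Field, ValidationError, conint

class ScrollConfig(BaseModel):
    # Mirrors the constraint added to SmartScraperRequest above.
    number_of_scrolls: Optional[conint(ge=0, le=100)] = Field(default=None)

print(ScrollConfig(number_of_scrolls=10).number_of_scrolls)  # 10
print(ScrollConfig().number_of_scrolls)  # None -> no scrolling performed

try:
    ScrollConfig(number_of_scrolls=150)  # above the 0-100 bound
except ValidationError as err:
    print(err)  # reports that the value must be <= 100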

scrapegraph-py/uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
