Commit 56c3b2d (1 parent: 9c60b86)

Update async_smartscraper_infinite_scroll_example.py
1 file changed: 43 additions & 45 deletions

async_smartscraper_infinite_scroll_example.py
@@ -1,71 +1,69 @@
 import asyncio
-from typing import List
-from pydantic import BaseModel
+from typing import List, Dict, Any

 from scrapegraph_py import AsyncClient
 from scrapegraph_py.logger import sgai_logger

 sgai_logger.set_logging(level="INFO")

-# Define the output schema
-class Company(BaseModel):
-    name: str
-    category: str
-    location: str
-
-class CompaniesResponse(BaseModel):
-    companies: List[Company]

 async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
     """Scrape companies from a specific YC batch with infinite scroll."""
     try:
+        # Initial scrape with infinite scroll enabled
         response = await client.smartscraper(
-            website_url=f"{url}?batch={batch}",
-            user_prompt="Extract all company names and their categories from the page",
-            output_schema=CompaniesResponse,
-            number_of_scrolls=10  # Scroll 10 times to load more companies
+            website_url=url,
+            user_prompt="Extract all company information from this page, including name, description, and website",
+            infinite_scroll=True,
+            scroll_options={
+                "max_scrolls": 10,  # Adjust based on page size
+                "scroll_delay": 2,  # Seconds between scrolls
+                "scroll_to_bottom": True
+            }
         )
-
-        # Parse and print the results
-        result = CompaniesResponse.model_validate(response['result'])
-        print(f"\nCompanies from {batch} batch:")
-        print("=" * 80)
-        for company in result.companies:
-            print(f"Name: {company.name}")
-            print(f"Category: {company.category}")
-            print(f"Location: {company.location}")
-            print("-" * 80)
-
+
+        # Process the results
+        companies = response.get("result", [])
+        if not companies:
+            print(f"No companies found for batch {batch}")
+            return
+
+        # Save or process the companies data
+        print(f"Found {len(companies)} companies in batch {batch}")
+        for company in companies:
+            print(f"Company: {company.get('name', 'N/A')}")
+            print(f"Description: {company.get('description', 'N/A')}")
+            print(f"Website: {company.get('website', 'N/A')}")
+            print("-" * 50)
+
     except Exception as e:
-        print(f"Error scraping {batch} batch: {e}")
+        print(f"Error scraping batch {batch}: {str(e)}")
+

 async def main():
     # Initialize async client
-    sgai_client = AsyncClient(api_key="your-api-key-here")
-
+    client = AsyncClient(api_key="your-api-key-here")
+
     try:
-        # Define batches to scrape
-        base_url = "https://www.ycombinator.com/companies"
-        batches = [
-            "Spring%202025",
-            "Winter%202025",
-            "Summer%202024"
-        ]
-
+        # Example YC batch URLs
+        batch_urls = {
+            "W24": "https://www.ycombinator.com/companies?batch=W24",
+            "S23": "https://www.ycombinator.com/companies?batch=S23"
+        }
+
         # Create tasks for each batch
         tasks = [
-            scrape_companies(sgai_client, base_url, batch)
-            for batch in batches
+            scrape_companies(client, url, batch)
+            for batch, url in batch_urls.items()
         ]
-
-        # Execute all scraping tasks concurrently
+
+        # Execute all batch scraping concurrently
         await asyncio.gather(*tasks)
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-
+
     finally:
-        await sgai_client.close()
+        # Ensure client is properly closed
+        await client.close()
+

 if __name__ == "__main__":
     asyncio.run(main())
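
For a quick check of the new call shape before fanning out over several batches with asyncio.gather, a single-batch run can be sketched as below. This is a minimal sketch, not part of the commit: it reuses the infinite_scroll and scroll_options parameters exactly as the updated example passes them (whether your installed scrapegraph-py version accepts them should be verified against its docs), with a placeholder API key and a hypothetical fetch_one_batch helper name.

import asyncio

from scrapegraph_py import AsyncClient


async def fetch_one_batch() -> None:
    # Placeholder key; in practice read it from an environment variable
    client = AsyncClient(api_key="your-api-key-here")
    try:
        # Same call shape as the updated example: scroll first, then extract
        response = await client.smartscraper(
            website_url="https://www.ycombinator.com/companies?batch=W24",
            user_prompt="Extract all company information from this page",
            infinite_scroll=True,
            scroll_options={
                "max_scrolls": 3,       # keep a trial run short
                "scroll_delay": 2,      # seconds between scrolls
                "scroll_to_bottom": True
            }
        )
        # The example treats response["result"] as a list of dicts
        for company in response.get("result", []):
            print(company.get("name", "N/A"))
    finally:
        await client.close()  # mirror the example's cleanup


if __name__ == "__main__":
    asyncio.run(fetch_one_batch())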
