import asyncio
- from typing import List
- from pydantic import BaseModel
+ from typing import List, Dict, Any

from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

sgai_logger.set_logging(level="INFO")

- # Define the output schema
- class Company(BaseModel):
-     name: str
-     category: str
-     location: str
-
- class CompaniesResponse(BaseModel):
-     companies: List[Company]

async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
    """Scrape companies from a specific YC batch with infinite scroll."""
    try:
+         # Initial scrape with infinite scroll enabled
        response = await client.smartscraper(
-             website_url=f"{url}?batch={batch}",
-             user_prompt="Extract all company names and their categories from the page",
-             output_schema=CompaniesResponse,
-             number_of_scrolls=10  # Scroll 10 times to load more companies
+             website_url=url,
+             user_prompt="Extract all company information from this page, including name, description, and website",
+             infinite_scroll=True,
+             scroll_options={
+                 "max_scrolls": 10,  # Adjust based on page size
+                 "scroll_delay": 2,  # Seconds between scrolls
+                 "scroll_to_bottom": True
+             }
        )
-
-         # Parse and print the results
-         result = CompaniesResponse.model_validate(response['result'])
-         print(f"\nCompanies from {batch} batch:")
-         print("=" * 80)
-         for company in result.companies:
-             print(f"Name: {company.name}")
-             print(f"Category: {company.category}")
-             print(f"Location: {company.location}")
-             print("-" * 80)
-
+
+         # Process the results
+         companies = response.get("result", [])
+         if not companies:
+             print(f"No companies found for batch {batch}")
+             return
+
+         # Save or process the companies data
+         print(f"Found {len(companies)} companies in batch {batch}")
+         for company in companies:
+             print(f"Company: {company.get('name', 'N/A')}")
+             print(f"Description: {company.get('description', 'N/A')}")
+             print(f"Website: {company.get('website', 'N/A')}")
+             print("-" * 50)
+
    except Exception as e:
-         print(f"Error scraping {batch} batch: {e}")
+         print(f"Error scraping batch {batch}: {str(e)}")
+

async def main():
    # Initialize async client
-     sgai_client = AsyncClient(api_key="your-api-key-here")
-
+     client = AsyncClient(api_key="your-api-key-here")
+
    try:
-         # Define batches to scrape
-         base_url = "https://www.ycombinator.com/companies"
-         batches = [
-             "Spring%202025",
-             "Winter%202025",
-             "Summer%202024"
-         ]
-
+         # Example YC batch URLs
+         batch_urls = {
+             "W24": "https://www.ycombinator.com/companies?batch=W24",
+             "S23": "https://www.ycombinator.com/companies?batch=S23"
+         }
+
        # Create tasks for each batch
        tasks = [
-             scrape_companies(sgai_client, base_url, batch)
-             for batch in batches
+             scrape_companies(client, url, batch)
+             for batch, url in batch_urls.items()
        ]
-
-         # Execute all scraping tasks concurrently
+
+         # Execute all batch scraping concurrently
        await asyncio.gather(*tasks)
-
-     except Exception as e:
-         print(f"An error occurred: {e}")
-
+
    finally:
-         await sgai_client.close()
+         # Ensure client is properly closed
+         await client.close()
+

if __name__ == "__main__":
    asyncio.run(main())
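Side note: the old version's typed Pydantic output and the new scroll options aren't mutually exclusive. Below is a minimal sketch combining them; `scrape_batch_typed` is a hypothetical helper, and the `output_schema`, `infinite_scroll`, and `scroll_options` keyword arguments are taken from the two sides of this diff, so verify them against the scrapegraph_py release you have installed.

import asyncio
from typing import List

from pydantic import BaseModel
from scrapegraph_py import AsyncClient


class Company(BaseModel):
    name: str
    description: str
    website: str


class CompaniesResponse(BaseModel):
    companies: List[Company]


async def scrape_batch_typed(url: str) -> CompaniesResponse:
    """Hypothetical helper: new scroll options plus the old typed schema."""
    client = AsyncClient(api_key="your-api-key-here")
    try:
        response = await client.smartscraper(
            website_url=url,
            user_prompt="Extract every company's name, description, and website",
            output_schema=CompaniesResponse,     # typed output, as in the old version
            infinite_scroll=True,                # scroll kwargs as used in this diff;
            scroll_options={"max_scrolls": 10},  # confirm against your SDK version
        )
        # Validate the raw result into the typed model (the old version's pattern)
        return CompaniesResponse.model_validate(response["result"])
    finally:
        await client.close()


if __name__ == "__main__":
    parsed = asyncio.run(
        scrape_batch_typed("https://www.ycombinator.com/companies?batch=W24")
    )
    print(f"Parsed {len(parsed.companies)} companies")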