Skip to content

Commit 8f4cf62

Browse files
author
Dirk Breeuwer
committed
Fix: generate scraper to include source as an argument
1 parent c5da8f5 commit 8f4cf62

File tree

2 files changed

+23
-20
lines changed

2 files changed

+23
-20
lines changed

gpt-scraper.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
from website_analysis.dom_analysis import HtmlLoader, UrlHtmlLoader
66
from scraper_generation.scraper_generator import ScrapingCodeGenerator, CodeWriter
77
from data_extraction.data_extractor import CodeExecutor
8-
from user_requirements_gathering import UserRequirementGatherer
9-
108

119

1210
def main():
@@ -20,7 +18,7 @@ def main():
2018
source_type = args.source_type
2119
USER_REQUIREMENTS = args.requirements
2220

23-
# create HtmlLoader or UrlHtmlLoader based on the source type
21+
# Create HtmlLoader or UrlHtmlLoader based on the source type
2422
def create_html_loader(source, source_type):
2523
if source_type == 'url':
2624
return UrlHtmlLoader(source)
@@ -30,7 +28,7 @@ def create_html_loader(source, source_type):
3028
html_loader = create_html_loader(source, source_type)
3129

3230
# Instantiate ScrapingCodeGenerator with the html_loader
33-
code_generator = ScrapingCodeGenerator(html_loader)
31+
code_generator = ScrapingCodeGenerator(html_loader, source=source, source_type=source_type)
3432

3533
# Generate scraping code
3634
scraping_code = code_generator.generate_scraping_code(USER_REQUIREMENTS)
@@ -51,8 +49,4 @@ def create_html_loader(source, source_type):
5149
main()
5250

5351

54-
55-
56-
57-
5852
# python3 gpt-scraper.py --source-type "file" --source "./results/denver.html" --requirements "Extract the average monthly temperature in denver"

scraper_generation/scraper_generator.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,23 @@
44

55
class ScrapingCodeGenerator:
66
MODEL_NAME = "text-davinci-003"
7-
TEMPLATE = """
7+
SCRAPING_CODE = f"""
8+
from bs4 import BeautifulSoup
9+
from website_analysis.dom_analysis import HtmlLoader, UrlHtmlLoader
10+
11+
# Create HtmlLoader or UrlHtmlLoader based on the source type
12+
def create_html_loader(source, source_type):
13+
if source_type == 'url':
14+
return UrlHtmlLoader(source)
15+
else: # source_type == 'file'
16+
return HtmlLoader(source)
17+
18+
html_loader = create_html_loader("{{source}}", "{{source_type}}")
19+
response = html_loader.load()
20+
21+
html_soup = BeautifulSoup(response, 'html.parser')
22+
"""
23+
PROMPT_TEMPLATE = """
824
You are an expert website analyzer for a web scraping process.
925
Take the user requirements and convert it into clean python code to scrape the website.
1026
@@ -25,27 +41,20 @@ class ScrapingCodeGenerator:
2541
html_soup = BeautifulSoup(response, 'html.parser')
2642
"""
2743

28-
static_code = """
29-
from bs4 import BeautifulSoup
3044

31-
# Get the URL of the website
32-
with open('./results/denver.html') as f:
33-
response = f.read()
34-
35-
html_soup = BeautifulSoup(response, 'html.parser')
36-
"""
3745

38-
def __init__(self, html_loader):
46+
def __init__(self, html_loader, source, source_type):
3947
self.html_loader = html_loader
4048
self.llm = self.initialize_llm()
4149
self.prompt_template = self.initialize_template()
50+
self.scraping_code = self.SCRAPING_CODE.format(source=source, source_type=source_type)
4251

4352
def initialize_llm(self):
4453
load_dotenv()
4554
return OpenAI(model_name=self.MODEL_NAME, temperature=0)
4655

4756
def initialize_template(self):
48-
return PromptTemplate(input_variables=["requirements","html"], template=self.TEMPLATE)
57+
return PromptTemplate(input_variables=["requirements","html"], template=self.PROMPT_TEMPLATE)
4958

5059

5160
def generate_scraping_code(self, user_requirements):
@@ -57,7 +66,7 @@ def generate_scraping_code(self, user_requirements):
5766
generated_code = self.llm(formatted_prompt)
5867

5968
full_scraping_code = f"""
60-
{self.static_code}
69+
{self.scraping_code}
6170
{generated_code}
6271
"""
6372
return full_scraping_code

0 commit comments

Comments (0)