Skip to content

Commit c5da8f5

Browse files
author
Dirk Breeuwer
committed
Refactor prompt templates from gpt scraper to ScraperGenerator
1 parent 969b2fd · commit c5da8f5

File tree

2 files changed

+49
-49
lines changed

2 files changed

+49
-49
lines changed

gpt-scraper.py

Lines changed: 10 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,9 @@
33
from langchain.llms import OpenAI
44
import argparse
55
from website_analysis.dom_analysis import HtmlLoader, UrlHtmlLoader
6-
from scraper_generation.scraper_generator import ScrapingCodeGenerator, CodeWriter
6+
from scraper_generation.scraper_generator import ScrapingCodeGenerator, CodeWriter
77
from data_extraction.data_extractor import CodeExecutor
8+
from user_requirements_gathering import UserRequirementGatherer
89

910

1011

@@ -19,48 +20,17 @@ def main():
1920
source_type = args.source_type
2021
USER_REQUIREMENTS = args.requirements
2122

22-
23-
24-
# Variables to define
25-
MODEL_NAME = "text-davinci-003"
26-
TEMPLATE = """
27-
You are an expert website analyzer for a web scraping process.
28-
Take the user requirements and convert it into clean python code to scrape the website.
29-
30-
USER REQUIREMENTS:
31-
{requirements}
32-
33-
HTML CODE YOU NEED TO SCRAPE:
34-
{html}
35-
36-
FINISH THE PYTHON CODE TO SCRAPE THE WEBSITE:
37-
38-
from bs4 import BeautifulSoup
39-
40-
# Get the URL of the website
41-
with open('./results/denver.html') as f:
42-
response = f.read()
43-
44-
html_soup = BeautifulSoup(response, 'html.parser')
45-
"""
46-
47-
# Load environment variables
48-
load_dotenv()
49-
50-
# Create the LLM
51-
llm = OpenAI(model_name=MODEL_NAME, temperature=0)
52-
53-
# Create prompt template
54-
prompt_template = PromptTemplate(input_variables=["requirements","html"], template=TEMPLATE)
55-
5623
# create HtmlLoader or UrlHtmlLoader based on the source type
57-
if source_type == 'url':
58-
html_loader = UrlHtmlLoader(source)
59-
else: # source_type == 'file'
60-
html_loader = HtmlLoader(source)
24+
def create_html_loader(source, source_type):
25+
if source_type == 'url':
26+
return UrlHtmlLoader(source)
27+
else: # source_type == 'file'
28+
return HtmlLoader(source)
29+
30+
html_loader = create_html_loader(source, source_type)
6131

6232
# Instantiate ScrapingCodeGenerator with the html_loader
63-
code_generator = ScrapingCodeGenerator(html_loader, llm, prompt_template)
33+
code_generator = ScrapingCodeGenerator(html_loader)
6434

6535
# Generate scraping code
6636
scraping_code = code_generator.generate_scraping_code(USER_REQUIREMENTS)

scraper_generation/scraper_generator.py

Lines changed: 39 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,28 @@
33
from langchain import PromptTemplate
44

55
class ScrapingCodeGenerator:
6+
MODEL_NAME = "text-davinci-003"
7+
TEMPLATE = """
8+
You are an expert website analyzer for a web scraping process.
9+
Take the user requirements and convert it into clean python code to scrape the website.
10+
11+
USER REQUIREMENTS:
12+
{requirements}
13+
14+
HTML CODE YOU NEED TO SCRAPE:
15+
{html}
16+
17+
FINISH THE PYTHON CODE TO SCRAPE THE WEBSITE:
18+
19+
from bs4 import BeautifulSoup
20+
21+
# Get the URL of the website
22+
with open('./results/denver.html') as f:
23+
response = f.read()
24+
25+
html_soup = BeautifulSoup(response, 'html.parser')
26+
"""
27+
628
static_code = """
729
from bs4 import BeautifulSoup
830
@@ -11,11 +33,20 @@ class ScrapingCodeGenerator:
1133
response = f.read()
1234
1335
html_soup = BeautifulSoup(response, 'html.parser')
14-
"""
15-
def __init__(self, html_loader, llm, prompt_template):
36+
"""
37+
38+
def __init__(self, html_loader):
1639
self.html_loader = html_loader
17-
self.llm = llm
18-
self.prompt_template = prompt_template
40+
self.llm = self.initialize_llm()
41+
self.prompt_template = self.initialize_template()
42+
43+
def initialize_llm(self):
44+
load_dotenv()
45+
return OpenAI(model_name=self.MODEL_NAME, temperature=0)
46+
47+
def initialize_template(self):
48+
return PromptTemplate(input_variables=["requirements","html"], template=self.TEMPLATE)
49+
1950

2051
def generate_scraping_code(self, user_requirements):
2152
"""
@@ -26,11 +57,11 @@ def generate_scraping_code(self, user_requirements):
2657
generated_code = self.llm(formatted_prompt)
2758

2859
full_scraping_code = f"""
29-
{self.static_code}
30-
{generated_code}
60+
{self.static_code}
61+
{generated_code}
3162
"""
3263
return full_scraping_code
33-
64+
3465
class CodeWriter:
3566
def __init__(self, file_name):
3667
self.file_name = file_name
@@ -40,5 +71,4 @@ def write(self, scraping_code):
4071
Writes the scraping code to a .py python file
4172
"""
4273
with open(self.file_name, 'w') as file:
43-
file.write(scraping_code)
44-
74+
file.write(scraping_code)

0 commit comments

Comments
 (0)