4
4
5
5
class ScrapingCodeGenerator :
6
6
MODEL_NAME = "text-davinci-003"
7
- TEMPLATE = """
7
# Preamble placed in front of the LLM-generated scraper code. It is a
# ``str.format`` template with two fields, ``source`` and ``source_type``.
# Both fields use the ``!r`` conversion so the values are written into the
# generated program as escaped Python string literals: backslashes in file
# paths or quotes inside a URL can no longer corrupt the generated code.
# NOTE(review): assumes both values are plain ``str`` objects - confirm with callers.
SCRAPING_CODE = """
from bs4 import BeautifulSoup
from website_analysis.dom_analysis import HtmlLoader, UrlHtmlLoader

# Create HtmlLoader or UrlHtmlLoader based on the source type
def create_html_loader(source, source_type):
    if source_type == 'url':
        return UrlHtmlLoader(source)
    else: # source_type == 'file'
        return HtmlLoader(source)

html_loader = create_html_loader({source!r}, {source_type!r})
response = html_loader.load()

html_soup = BeautifulSoup(response, 'html.parser')
"""
23
+ PROMPT_TEMPLATE = """
8
24
You are an expert website analyzer for a web scraping process.
9
25
Take the user requirements and convert it into clean python code to scrape the website.
10
26
@@ -25,27 +41,20 @@ class ScrapingCodeGenerator:
25
41
html_soup = BeautifulSoup(response, 'html.parser')
26
42
"""
27
43
28
- static_code = """
29
- from bs4 import BeautifulSoup
30
44
31
- # Get the URL of the website
32
- with open('./results/denver.html') as f:
33
- response = f.read()
34
-
35
- html_soup = BeautifulSoup(response, 'html.parser')
36
- """
37
45
38
def __init__(self, html_loader, source, source_type):
    """Build the LLM client, the prompt template and the scraping preamble.

    Args:
        html_loader: Stored unchanged on ``self.html_loader``.
        source: Value substituted for the ``source`` field of
            ``SCRAPING_CODE``.
        source_type: Value substituted for the ``source_type`` field of
            ``SCRAPING_CODE``.
    """
    self.html_loader = html_loader
    self.llm = self.initialize_llm()
    self.prompt_template = self.initialize_template()
    # Fill the preamble template once so later code generation can reuse it.
    template_fields = {"source": source, "source_type": source_type}
    self.scraping_code = self.SCRAPING_CODE.format(**template_fields)
42
51
43
52
def initialize_llm(self):
    """Return an ``OpenAI`` instance for ``MODEL_NAME`` with temperature 0.

    ``load_dotenv()`` runs first; presumably this makes credentials from a
    ``.env`` file available to the client (not verifiable from this file).
    """
    load_dotenv()
    llm = OpenAI(model_name=self.MODEL_NAME, temperature=0)
    return llm
46
55
47
56
def initialize_template(self):
    """Wrap ``PROMPT_TEMPLATE`` in a ``PromptTemplate``.

    The template is declared with two input variables, ``requirements``
    and ``html``.
    """
    prompt_variables = ["requirements", "html"]
    return PromptTemplate(
        template=self.PROMPT_TEMPLATE,
        input_variables=prompt_variables,
    )
49
58
50
59
51
60
def generate_scraping_code (self , user_requirements ):
@@ -57,7 +66,7 @@ def generate_scraping_code(self, user_requirements):
57
66
generated_code = self .llm (formatted_prompt )
58
67
59
68
full_scraping_code = f"""
60
- { self .static_code }
69
+ { self .scraping_code }
61
70
{ generated_code }
62
71
"""
63
72
return full_scraping_code
0 commit comments