"""Scrape book listings from books.toscrape.com through an authenticated proxy.

For each book on the landing page, print its title, price, availability,
a truncated description (fetched from the book's own detail page), and the
absolute link to that detail page.
"""

import requests
from bs4 import BeautifulSoup

# Proxy authentication information.
# NOTE(review): the original text had a stray space before '@' in the proxy
# URL ('...password @gate...'), which would break authentication — removed.
proxy = {'http': 'http://username:password@gate.smartproxy.com:10000'}

url = 'http://books.toscrape.com/'  # Website to make a GET request to

r = requests.get(url, proxies=proxy)  # GET the listing page via the proxy
html = BeautifulSoup(r.content, 'html.parser')  # Parse the HTML

# Every book on the page is an <article class="product_pod"> element.
all_books = html.find_all('article', class_='product_pod')

# Loop through each element and extract title, price, availability,
# link, and (from the detail page) the description.
for book in all_books:
    title = book.h3.a['title']
    price = book.find('p', class_='price_color').text
    availability = book.find('p', class_='instock availability').text.strip()

    # Build an absolute URL from the relative href on the listing page.
    link_to_book = book.h3.a['href']
    link = "http://books.toscrape.com/{0}".format(link_to_book)

    # Fetch the detail page (also through the proxy); the description is the
    # first <p> whose class attribute is empty.
    r2 = requests.get(link, proxies=proxy)
    html2 = BeautifulSoup(r2.content, 'html.parser')
    description = html2.find('p', class_='').text

    print(title)
    print(price)
    print(availability)
    print("{0}...".format(description[:150]))  # Truncate text over 150 chars
    print(link)
    print()  # Empty line to separate each result