Skip to content

Commit f0d7c85

Browse files
authored
Update scraper.py
1 parent 112ba7a commit f0d7c85

File tree

1 file changed

+10
-12
lines changed

1 file changed

+10
-12
lines changed

scraper.py

+10-12
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,28 @@
"""Scrape book listings from books.toscrape.com through a proxy.

For every book on the landing page, fetch its detail page and print the
title, price, availability, a truncated description, and the book's URL.
"""

import requests
from bs4 import BeautifulSoup

# Proxy authentication information.
# NOTE(review): only an 'http' key is present, so plain-HTTP traffic is
# proxied; add an 'https' entry as well if the target ever moves to HTTPS.
proxy = {'http': 'http://username:password@gate.smartproxy.com:10000'}

# Website to make a GET request to.  Hrefs on this landing page are
# relative and already include the 'catalogue/' prefix, which is why the
# link below is built against the site root (not '/catalogue/').
url = 'http://books.toscrape.com/'

r = requests.get(url, proxies=proxy)  # Make the GET request through the proxy
r.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
html = BeautifulSoup(r.content, 'html.parser')  # Parse the HTML

# Each book on the page is an <article class="product_pod"> element.
all_books = html.find_all('article', class_='product_pod')

# Loop through each element and find the title, price, availability,
# and description.
for book in all_books:
    title = book.h3.a['title']
    price = book.find('p', class_='price_color').text
    availability = book.find('p', class_='instock availability').text.strip()
    link_to_book = book.h3.a['href']  # relative, e.g. 'catalogue/<slug>/index.html'
    link = "http://books.toscrape.com/{0}".format(link_to_book)

    # Make a new request to the URL extracted above to get the description.
    r2 = requests.get(link, proxies=proxy)
    r2.raise_for_status()
    html2 = BeautifulSoup(r2.content, 'html.parser')

    # The description is the first class-less <p> on the detail page.
    # Some books have no description, so find() can return None — guard
    # against that instead of crashing on `.text` (original bug).
    description_tag = html2.find('p', class_='')
    description = description_tag.text if description_tag is not None else ''

    print(title)
    print(price)
    print(availability)
    print("{0}...".format(description[:150]))  # Truncate text over 150 characters
    print(link)
    print()  # Print an empty line to separate each result

0 commit comments

Comments (0)