Commit 52ad276

Gealber committed: PCC49, need some changes
1 parent d2cfca0 commit 52ad276

File tree

2 files changed: +187 -0 lines changed


49/Gealber/config.ini

Whitespace-only changes.
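read_config() in crawl_verifier.py below reads this file one url per line; the committed version is still whitespace-only. A populated config.ini would presumably look like this (hypothetical entry):

http://planetpython.org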

49/Gealber/crawl_verifier.py

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
import concurrent.futures
import threading
from urllib.request import Request, urlopen
from urllib.error import URLError
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import sqlite3
import feedparser
from datetime import date
import queue
import time
import logging

conn = sqlite3.connect('urls.db')
c = conn.cursor()
# Create the table only if it does not exist yet, so reruns don't fail.
c.execute('CREATE TABLE IF NOT EXISTS Urls (urls VARCHAR)')
# Unbounded queue: with maxsize=10 the puts in main() would block once the
# queue filled up, since nothing consumes it until the executor starts.
links_to_crawl = queue.Queue()


class Crawl:
    base_url = ''
    error_links = set()
    crawled_links = set()
    headers = {}

    def __init__(self, headers, base_url):
        Crawl.headers = headers
        Crawl.base_url = base_url

    @staticmethod
    def crawl(thread_name, url, links_to_crawl):
        """
        Connect to the given url and record it in the corresponding set or queue.
        If an error occurs while fetching, the url is added to the error_links set.
        Otherwise the links on the page are extracted with BeautifulSoup and passed
        to enqueue_links() to be enqueued.
        :param thread_name: current thread name.
        :param url: url to connect to.
        :param links_to_crawl: queue holding the links still to be crawled.
        """
        try:
            if urlparse(url).netloc == 'planetpython.org' and url not in Crawl.error_links:
                req = Request(url, headers=Crawl.headers)
                response = urlopen(req)
                Crawl.crawled_links.add(url)
                logging.info(f"{thread_name}")
                # getcode() returns an int, not a string.
                if response.getcode() == 200:
                    logging.info("Connected successfully")
                    # Flag feeds whose feed-level published date (assuming the
                    # feed provides one) is more than a year old.
                    d = feedparser.parse(f"{url}/rss20.xml")
                    date_updated = d.feed.published_parsed
                    date_updated = date(date_updated[0], date_updated[1], date_updated[2])
                    difference = date.today() - date_updated
                    if difference.days > 365:
                        print(f"Feed {url}/rss20.xml is out of date")
                        logging.warning(f"Feed {url}/rss20.xml is out of date")
                        Crawl.error_links.add(url)
                print(f"Url {url} crawled with status: {response.getcode()}")
                print(f" {len(Crawl.crawled_links)} crawled in total")
                logging.info(f"Url {url} crawled with status: {response.getcode()}")
                logging.info(f" {len(Crawl.crawled_links)} crawled in total")
                soup = BeautifulSoup(str(response.read(), 'utf-8'), 'lxml')
                Crawl.enqueue_links(soup.find_all('a'), links_to_crawl)
        except URLError as err:
            print(f"Url {url} threw this error: {err.reason}")
            logging.error(f"Url {url} threw this error: {err.reason}")
            Crawl.error_links.add(url)

    @staticmethod
    def enqueue_links(links, links_to_crawl):
        """
        Enqueue each extracted link that is neither in the crawled_links set
        nor already in the links_to_crawl queue.
        :param links: anchor tags extracted from the page.
        :param links_to_crawl: queue that contains the links to be crawled.
        """
        for link in links:
            full_url = urljoin(Crawl.base_url, link.get('href'))
            # Queue objects don't support membership tests directly;
            # check the underlying deque instead.
            if full_url not in Crawl.crawled_links and full_url not in links_to_crawl.queue:
                # Put the resolved url, not the <a> tag itself.
                links_to_crawl.put(full_url)
                logging.info(f"Link {full_url} just added to the queue links_to_crawl")


class Inspection:
    def __init__(self, headers):
        self.headers = headers

    @staticmethod
    def read_config(config_file) -> list:
        """
        Extract urls from config.ini, one per line.
        :param config_file: path to the config file.
        """
        urls = []
        # Open for reading; 'w' would truncate the file.
        with open(config_file, 'r') as file:
            for line in file.readlines():
                if line.strip():
                    urls.append(line.strip())
        return urls

    def links_start_page(self, timeout) -> list:
        """
        Extract links to examine from the start page.
        :param timeout: timeout for the connection, in seconds.
        :return: a list with the urls of the home page.
        """
        req = Request('https://planetpython.org/', headers=self.headers)
        with urlopen(req, timeout=timeout) as resp:
            page = str(resp.read(), 'utf-8')

        soup = BeautifulSoup(page, 'lxml')
        children = soup.select("ul.level-one > li")
        extract_sections = []
        for child in children:
            extract_sections.append(child)
        links = []
        for i in range(1, 4):
            soup_link = extract_sections[i]
            items = soup_link.select("li > a")
            for item in items:
                # Read the href attribute directly instead of slicing str(item).
                href = item.get('href')
                if href:
                    links.append(href)
        return links


def run(url):
    """
    Crawl a single url pulled from the queue and return it on success,
    so the caller can store it in the database.
    :param url: url to crawl.
    """
    try:
        Crawl.crawl(threading.current_thread().name, url, links_to_crawl)
        logging.info(f"Starting to crawl {url} in thread {threading.current_thread()}")
        print("Waiting 5 secs...")
        logging.info("Waiting 5 secs....")
        time.sleep(5)
        return url
    except Exception:
        print(f"Exception thrown with {url}")
        logging.exception(f"Exception thrown with {url}")
    finally:
        links_to_crawl.task_done()


def main():
    logging.basicConfig(filename='crawler.log', format='%(levelname)s:%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logging.info('Started...')
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 '
                      '(KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53',
        'Accept-Language': 'en-US, en;q=0.8',
        'Accept': 'text/html,application/xhtml+xml,application/xml'
    }
    url = 'http://planetpython.org'
    Crawl(headers=headers, base_url=url)
    links_to_crawl.put(url)
    inspection = Inspection(headers=headers)
    urls_to_inspection = inspection.read_config('config.ini')
    # extend, not append: links_start_page() returns a list of urls.
    urls_to_inspection.extend(inspection.links_start_page(timeout=60))
    for link in urls_to_inspection:
        links_to_crawl.put(link)
    # Create the executor once and drain the queue inside it, instead of
    # building a new pool on every loop iteration.
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = []
        while not links_to_crawl.empty():
            url = links_to_crawl.get()
            logging.info(f"Url {url} just pulled out of the queue links_to_crawl")
            if url is not None:
                futures.append(executor.submit(run, url))

        for future in concurrent.futures.as_completed(futures):
            try:
                if future.result() is not None:
                    c.execute('INSERT INTO Urls (urls) VALUES (?)', (future.result(),))
                    logging.info(f"{future.result()} added to the database")
            except Exception:
                print(future.exception())
                logging.exception(f"The following exception was thrown: {future.exception()}")

    logging.info('...Finished')

    conn.commit()
    conn.close()
    print(f"Total links crawled {len(Crawl.crawled_links)}")
    print(f"Total errors {len(Crawl.error_links)}")


if __name__ == '__main__':
    main()
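Once the script has run, the crawled urls end up in urls.db. A minimal sketch for inspecting them afterwards, assuming the Urls table was created as above:

import sqlite3

conn = sqlite3.connect('urls.db')
# Each row is a one-element tuple holding a crawled url.
for (url,) in conn.execute('SELECT urls FROM Urls'):
    print(url)
conn.close()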
