import concurrent.futures
import threading
from urllib.request import Request, urlopen
from urllib.error import URLError  # URLError lives in urllib.error, not urllib.request
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import sqlite3
import feedparser
from datetime import date
import queue
import time
import logging

conn = sqlite3.connect('urls.db')
c = conn.cursor()
c.execute('CREATE TABLE IF NOT EXISTS Urls (urls VARCHAR)')  # IF NOT EXISTS so a rerun does not fail
links_to_crawl = queue.Queue()  # unbounded, so enqueue_links() never blocks a worker thread
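# Note: the connection, cursor and queue above are module-level state shared by the worker
# threads. sqlite3 objects should only be used from the thread that created them (the default
# check_same_thread=True), so all database writes stay in the main thread inside main().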


class Crawl:
    # Class-level attributes: __init__ overwrites them on the class, so every thread shares them.
    base_url = ''
    error_links = set()
    crawled_links = set()
    headers = {}

    def __init__(self, headers, base_url):
        Crawl.headers = headers
        Crawl.base_url = base_url

    @staticmethod
    def crawl(thread_name, url, links_to_crawl):
        """
        Connect to the given url and record it in the corresponding set() or queue.
        If there is an error while parsing, the url is added to the error_links set.
        Otherwise the links of the page are extracted with BeautifulSoup and passed to
        enqueue_links() to be enqueued.
        :param thread_name: current thread name.
        :param url: url to connect to.
        :param links_to_crawl: queue with the links extracted.
        """
        try:
            if urlparse(url).netloc == 'planetpython.org' and url not in Crawl.error_links:
                req = Request(url, headers=Crawl.headers)
                response = urlopen(req)
                Crawl.crawled_links.add(url)
                logging.info(f"{thread_name}")
                if response.getcode() == 200:  # getcode() returns an int, not the string '200'
                    logging.info("Connected successfully")
                    d = feedparser.parse(f"{url}/rss20.xml")
                    # Not every page exposes a feed date; only check freshness when it is present.
                    date_updated = d.feed.get('published_parsed')
                    if date_updated is not None:
                        date_updated = date(date_updated[0], date_updated[1], date_updated[2])
                        difference = date.today() - date_updated
                        if difference.days > 365:
                            print(f"Feed {url}/rss20.xml is Out of date")
                            logging.warning(f"Feed {url}/rss20.xml is Out of date")
                            Crawl.error_links.add(url)
                    print(f"Url {url} Crawled with status: {response.getcode()}")
                    print(f" {len(Crawl.crawled_links)} Crawled in total")
                    logging.info(f"Url {url} Crawled with status: {response.getcode()}")
                    logging.info(f" {len(Crawl.crawled_links)} Crawled in total")
                    soup = BeautifulSoup(str(response.read(), 'utf-8'), 'lxml')
                    Crawl.enqueue_links(soup.find_all('a'), links_to_crawl)
        except URLError as err:
            print(f"Url {url} threw this error: {err.reason}")
            logging.error(f"Url {url} threw this error: {err.reason}")
            Crawl.error_links.add(url)

    @staticmethod
    def enqueue_links(links, links_to_crawl):
        """
        Enqueue the links that are neither in the crawled_links set nor already in links_to_crawl.
        :param links: <a> tags extracted from the crawled page.
        :param links_to_crawl: queue that contains the links to be crawled.
        """
        for link in links:
            absolute_url = urljoin(Crawl.base_url, link.get('href'))
            # queue.Queue has no membership test, so peek at its internal deque; enqueue the
            # absolute url rather than the BeautifulSoup tag so crawl() receives a string.
            if absolute_url not in Crawl.crawled_links and absolute_url not in links_to_crawl.queue:
                links_to_crawl.put(absolute_url)
                logging.info(f"Link {absolute_url} just added to the queue links_to_crawl")


class Inspection:
    def __init__(self, headers):
        self.headers = headers

    @staticmethod
    def read_config(config_file) -> list:
        """
        Extract urls from config.ini.
        :param config_file: path to the config file, one url per line.
        """
        urls = []
        with open(config_file, 'r') as file:  # open for reading ('w' would truncate the file)
            for line in file.readlines():
                urls.append(line.strip())
        return urls

    def links_start_page(self, timeout) -> list:
        """
        Extract links to examine from the start page.
        :param timeout: timeout for the connection, in seconds.
        :return: a list with the urls of the home page.
        """
        req = Request('https://planetpython.org/', headers=self.headers)
        with urlopen(req, timeout=timeout) as resp:
            page = str(resp.read(), 'utf-8')

        soup = BeautifulSoup(page, 'lxml')
        sections = soup.select("ul.level-one > li")
        links = []
        # Only sections 1-3 of the sidebar are examined, matching the original range(1, 4).
        for section in sections[1:4]:
            for item in section.select("li > a"):
                links.append(item.get('href'))
        return links


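# run() below is the unit of work handed to the thread pool: it crawls one url and returns it,
# so main() can persist the crawled url in the Urls table.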
def run(url):
    """
    Crawl a single url inside a worker thread and mark the queue item as done.
    :param url: url to crawl.
    :return: the crawled url, so it can be stored in the database.
    """
    try:
        logging.info(f"Starting to crawl {url} in thread {threading.current_thread().name}")
        Crawl.crawl(threading.current_thread().name, url, links_to_crawl)
        print("Waiting 5 secs...")
        logging.info("Waiting 5 secs....")
        time.sleep(5)
        return url
    except Exception:
        print(f"Exception thrown with {url}")
        logging.exception(f"Exception thrown with {url}")
    finally:
        links_to_crawl.task_done()


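# main() below assumes a config.ini next to the script with one url per line (read_config()
# raises FileNotFoundError if it is missing). A minimal config.ini could simply contain:
#
#     https://planetpython.org/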
def main():
    logging.basicConfig(filename='crawler.log', format='%(levelname)s:%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
    logging.info('Started...')
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 '
                      '(KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53',
        'Accept-Language': 'en-US,en;q=0.8',
        'Accept': 'text/html,application/xhtml+xml,application/xml'
    }
    url = 'http://planetpython.org'
    Crawl(headers=headers, base_url=url)
    links_to_crawl.put(url)
    inspection = Inspection(headers=headers)
    urls_to_inspection = inspection.read_config('config.ini')
    # links_start_page() returns a list, so extend instead of appending the list itself.
    urls_to_inspection.extend(inspection.links_start_page(timeout=60))
    for link in urls_to_inspection:
        links_to_crawl.put(link)
    # Create the pool once instead of building a new executor for every queue item.
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        while not links_to_crawl.empty():
            url = links_to_crawl.get()
            logging.info(f"Url {url} just pulled out of the queue links_to_crawl")
            futures = []
            if url is not None:
                futures.append(executor.submit(run, url))

            for future in concurrent.futures.as_completed(futures):
                try:
                    if future.result() is not None:
                        c.execute('INSERT INTO Urls (urls) VALUES (?)', (future.result(),))
                        logging.info(f"{future.result()} added to the database")
                except Exception:
                    print(future.exception())
                    logging.exception(f"The following exception was thrown: {future.exception()}")

    conn.commit()
    conn.close()
    print(f"Total links Crawled {len(Crawl.crawled_links)}")
    print(f"Total errors {len(Crawl.error_links)}")
    logging.info('...Finished')


if __name__ == '__main__':
    main()