forked from chrisgioia64/GovernmentEntityScraper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathurl_checker.py
92 lines (74 loc) · 2.64 KB
/
url_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from os import stat
import openpyxl
from pathlib import Path
from bs4 import BeautifulSoup
import requests
import logging
import datetime
import sys
# Setup logging: the root logger writes every record at INFO or above to a
# timestamped file under logs/ and echoes it to stdout.
now = datetime.datetime.now()
date_time = now.strftime("%Y-%m-%d %H-%M-%S")
logger = logging.getLogger()
# FileHandler opens its target file as soon as it is constructed and raises
# FileNotFoundError when the directory is missing, so create logs/ first.
Path("logs").mkdir(parents=True, exist_ok=True)
handler = logging.FileHandler(
    mode="w", filename="logs/url_checker_" + date_time + ".txt")
handler.setLevel(logging.INFO)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
# The stream handler itself accepts DEBUG, but the logger level set above
# (INFO) is applied first, so DEBUG records never reach it.
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
"""
Return the status code of the url using the 'requests' library.
Returns -1 if the request raised an exception
"""
def getStatusCode(url):
    """
    Return the HTTP status code obtained by requesting ``url``.

    Issues a GET through the 'requests' library with a 10 second timeout.

    url -- the address to request.

    Returns the status code of the response, or -1 if the request raised
    an exception. The exception is logged at INFO level, not propagated.
    """
    try:
        response = requests.get(url, timeout=10)
    except Exception as e:
        # Broad on purpose: every failure is logged and reported through the
        # -1 sentinel instead of propagating to the caller.
        logger.info("get url: %s %s", url, e)
        return -1
    return response.status_code
"""
excel_filename -- an excel file where the active spreadsheet is a listing of Government entities
with an "entity name" column and a "url" column specified by the function parameters
"entityNameColumn" and "entityUrlColumn" respectively.
"""
def getUrlResults(excel_filename, entityNameColumn, entityUrlColumn, header_exists=True, debug=False):
    """
    Request the URL of every entity listed in a spreadsheet and collect the results.

    excel_filename -- path of an excel file whose active sheet lists one
        entity per row.
    entityNameColumn -- column letter holding the entity name (it is
        concatenated with the row number to form a cell reference).
    entityUrlColumn -- column letter holding the entity url.
    header_exists -- when True, row 1 is treated as a header and skipped.
    debug -- when True, log every result and the final status code tally.

    Returns an EntityUrlCollection built from a list with one EntityUrlInfo
    per processed row and a dict mapping each status code returned by
    getStatusCode to the number of rows that produced it.
    """
    wsheet = openpyxl.load_workbook(Path(excel_filename)).active
    # Visit exactly the rows the sheet contains. Starting at row 2 when a
    # header is present must not push the last read to max_row + 1, which is
    # an empty row that would be requested and counted as a failure.
    firstRow = 2 if header_exists else 1
    statusCodeMap = {}
    collection = []
    for rowNumber in range(firstRow, wsheet.max_row + 1):
        entityName = wsheet[entityNameColumn + str(rowNumber)].value
        entityUrl = wsheet[entityUrlColumn + str(rowNumber)].value
        statusCode = getStatusCode(entityUrl)
        statusCodeMap[statusCode] = statusCodeMap.get(statusCode, 0) + 1
        if debug:
            logger.info("%s -- %s (%s)", entityName, entityUrl, statusCode)
        collection.append(EntityUrlInfo(entityName, entityUrl, statusCode))
    if debug:
        logger.info("status code map: %s", statusCodeMap)
    return EntityUrlCollection(collection, statusCodeMap)
class EntityUrlCollection:
    """
    Container pairing a collection of per-entity results with a tally map.

    collection -- the per-entity results; getUrlResults passes a list of
        EntityUrlInfo objects.
    map -- the tally; getUrlResults passes a dict from status code to the
        number of entities that returned it.
    """

    def __init__(self, collection, map):
        # 'map' shadows the builtin, but it is both a parameter name and an
        # attribute name callers may rely on, so it is kept as is.
        self.collection = collection
        self.map = map

    def __repr__(self):
        # Readable form so instances are useful when logged or inspected.
        return "%s(collection=%r, map=%r)" % (
            type(self).__name__, self.collection, self.map)
class EntityUrlInfo:
    """
    Record of one checked entity.

    entityName -- the entity's name.
    entityUrl -- the url associated with the entity.
    statusCode -- the status code recorded for that url.
    """

    def __init__(self, entityName, entityUrl, statusCode):
        self.entityName = entityName
        self.entityUrl = entityUrl
        self.statusCode = statusCode

    def __repr__(self):
        # Readable form so instances are useful when logged or inspected.
        return "%s(entityName=%r, entityUrl=%r, statusCode=%r)" % (
            type(self).__name__, self.entityName, self.entityUrl, self.statusCode)