Commit
Accidentally used the common scripts instead of the base_scripts
1 parent b9216e7 · commit 47e52f5
Showing 2 changed files with 36 additions and 174 deletions.
setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py (90 changes: 12 additions & 78 deletions)
```diff
@@ -1,87 +1,21 @@
 import sys
 import os
-import requests
-import json
 from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
-import CG_configs as configs
-from pathlib import Path
 
-# This is a hack that loads that root common folder like a module (without you expressly needing to install it).
-# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapesr use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
+from common.base_scrapers import crimegraphics_bulletin
 
-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# import data_parser from common/crimegraphics/utils/data_parser.py
-from crimegraphics.utils import data_parser
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# configs = {
-#     "url": "",
-#     "department_code": "",
-# }
-
-# Stats default to False
-def crimegraphics_bulletin(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-
-    # Automatically have the CLERYMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": configs.department_code,
-        "__EVENTTARGET": "MainMenu$BulletinMenu",
-        "__EVENTARGUMENT": "BulletinMenu",
-    }
-
-    # Initialize "data" table (a table called data, not a datatable)
-    data = []
-
-    print("Receiving Data... Please wait...")
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our headers
-    response = requests.request("POST", configs.url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    # with open("html.html", 'wb') as output:
-    #     output.write(str(soup).encode('utf-8'))
-    #     output.close()
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
+configs = {
+    "url": "",
+    "department_code": "",
+}
 
-    search_start = function_timer(stats)
+save_dir = "./data/"
+data = []
 
-    table = soup.find("span", id="Bull")
-    # Send "table" to page_update to be hashed and compared.
-    page_update(table)
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
 
-    # Import the parser
-    data_parser(configs, save_dir, table)
+crimegraphics_bulletin(configs, save_dir)
```
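The `parents[...]` index in the `sys.path` hack above simply selects an ancestor directory: `parents[0]` is the folder that contains the script, `parents[1]` is its parent, and so on, so the correct index depends entirely on how deep the generated script sits relative to whichever directory holds `common/`. A minimal sketch, using a hypothetical checkout path purely for illustration:

```python
from pathlib import Path

# Hypothetical location of the generated script; the real path depends on the checkout/deployment.
script = Path("/repo/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py")

print(script.parents[0])  # /repo/setup_gui/Base_Scripts/Scrapers/crimegraphics
print(script.parents[1])  # /repo/setup_gui/Base_Scripts/Scrapers
print(script.parents[3])  # /repo/setup_gui
print(script.parents[4])  # /repo  <- sys.path needs whichever ancestor actually contains common/
```

Under this made-up layout `parents[4]` is the repository root and `parents[5]` would already be the filesystem root, so the `parents[5]` used in the committed scripts presumably reflects a deeper runtime location; the only requirement is that the chosen ancestor is the directory containing `common/`.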
setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_clery.py (120 changes: 24 additions & 96 deletions)
```diff
@@ -1,101 +1,29 @@
 import sys
 import os
-import requests
-import json
 from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
-from datetime import date
-import CG_configs as configs
-from pathlib import Path
 
-# This is a hack that loads that root common folder like a module (without you expressly needing to install it).
-# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapesr use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
 
-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# stats default to False
-def crimegraphics_clery(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-        list_header = configs["list_header"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-        list_header = configs.list_header
-
-    # automatically have the CLERYMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": configs.department_code,
-        "__EVENTTARGET": "MainMenu$CLERYMenu",
-        "__EVENTARGUMENT": "CLERYMenu",
-    }
-
-    # initialize "data" table (a table called data, not a datatable)
-    data = []
-
-    print("Receiving Data... Please wait...")
-
-    # used for stats, mark beginning of request
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our headers
-    response = requests.request("POST", configs.url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
-
-    search_start = function_timer(stats)
-    # this website has a bunch of empty tables with the same name
-    # the 6th index has the data we need
-    table = soup.find_all("table", {"class": "ob_gBody"})[6]
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
-
-    hash_start = function_timer(stats)
-    # Checks if the page has been updated
-    page_update(table)
-
-    hash_end = function_timer(stats)
-    time_dif(stats, "Hash time", hash_start, hash_end)
-
-    # Use BeautifulSoup4 (bs4)'s find_all method to find all html table rows (tr)
-    rows = table.find_all("tr")
-    for row in tqdm(rows):
-        # Use BeautifulSoup4 (bs4)'s find_all method to find all html tags for table data (td)
-        td = row.find_all("td")
-        table_data = []
-        for actual_data in td:
-            table_data.append(actual_data.get_text())
-        data.append(table_data)
-
-    date_name = date.today()
-    file_name = "_" + str(date_name).replace("-", "_")  # + "_"
-
-    dataframe = pd.DataFrame(data=data, columns=configs.list_header)
-
-    dataframe.to_csv(save_dir + configs.department_code + file_name + "_daily_bulletin")
+from common.base_scrapers import crimegraphics_scraper
+
+configs = {
+    "url": "",
+    "department_code": "",
+    "list_header": [
+        "ChargeDescription",
+        "CaseNum",
+        "ReportDate",
+        "OffenseDate",
+        "Location",
+        "ChargeDisposition",
+    ],
+}
+
+save_dir = "./data/"
+data = []
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+crimegraphics_scraper(configs, save_dir)
```
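For context on what the deleted inline code (and, presumably, the shared base scraper that now replaces it) is doing: CrimeGraphics sites are ASP.NET WebForms pages, so the scraper fakes a click on the Bulletin/CLERY menu by POSTing the WebForms postback fields and then parsing the returned HTML. A rough sketch of that request, with a placeholder URL and agency code rather than real values:

```python
import requests
from bs4 import BeautifulSoup

# Placeholder values for illustration only; real runs take these from the configs dict.
url = "https://example.invalid/crimegraphics/default.aspx"
department_code = "EXAMPLEPD"

# These form fields emulate clicking the CLERY menu item on the WebForms page.
payload = {
    "MYAGCODE": department_code,
    "__EVENTTARGET": "MainMenu$CLERYMenu",
    "__EVENTARGUMENT": "CLERYMenu",
}

response = requests.post(url, data=payload)
soup = BeautifulSoup(response.text, "html.parser")

# Per the deleted code's comment, the page contains several "ob_gBody" tables
# and the one at index 6 holds the CLERY rows.
table = soup.find_all("table", {"class": "ob_gBody"})[6]
```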