Accidentally used the common scripts instead of the base_scripts
CaptainStabs committed Aug 10, 2021
1 parent b9216e7 commit 47e52f5
Showing 2 changed files with 36 additions and 174 deletions.
setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py
90 changes: 12 additions & 78 deletions
@@ -1,87 +1,21 @@
 import sys
 import os
-import requests
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
+import CG_configs as configs
+from pathlib import Path

 # This is a hack that loads the root common folder like a module (without you expressly needing to install it).
 # I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
+from common.base_scrapers import crimegraphics_bulletin

-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# import data_parser from common/crimegraphics/utils/data_parser.py
-from crimegraphics.utils import data_parser
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# configs = {
-#     "url": "",
-#     "department_code": "",
-# }

-# Stats default to False
-def crimegraphics_bulletin(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-
-    # Automatically have the BulletinMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": department_code,
-        "__EVENTTARGET": "MainMenu$BulletinMenu",
-        "__EVENTARGUMENT": "BulletinMenu",
-    }
-
-    # Initialize the "data" list (a plain list, not a DataFrame)
-    data = []
-
-    print("Receiving Data... Please wait...")
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our payload
-    response = requests.request("POST", url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    # with open("html.html", 'wb') as output:
-    #     output.write(str(soup).encode('utf-8'))
-    #     output.close()
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
+configs = {
+    "url": "",
+    "department_code": "",
+}

-    search_start = function_timer(stats)
+save_dir = "./data/"
+data = []

-    table = soup.find("span", id="Bull")
-    # Send "table" to page_update to be hashed and compared.
-    page_update(table)
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)

-    # Run the parser on the scraped table
-    data_parser(configs, save_dir, table)
+crimegraphics_bulletin(configs, save_dir)
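
A note on the parents[N] hack above: Path(__file__).resolve().parents is a sequence of the file's ancestor directories, where parents[0] is the directory containing the file, parents[1] is its parent, and so on, so the right index depends entirely on how deep the script sits in the tree (which is why these scripts and the list_pdf scrapers disagree). A minimal sketch of the idea, against an assumed layout that is not taken from this repo:

import sys
from pathlib import Path

# Assumed layout (hypothetical): <repo>/setup_gui/Base_Scripts/Scrapers/crimegraphics/script.py
# parents[0] -> .../crimegraphics, parents[1] -> .../Scrapers, ..., parents[4] -> <repo>
repo_root = Path(__file__).resolve().parents[4]  # the index depends on your own tree depth

# With the repo root on sys.path, "common" resolves as a top-level package.
sys.path.insert(1, str(repo_root))
from common.base_scrapers import crimegraphics_bulletin
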
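
The deleted function_timer and time_dif helpers implement a simple opt-in timing pattern: when stats is truthy, time.perf_counter() stamps are taken around each phase and the difference is printed; when stats is False they do nothing. Inlined, the pattern looks like this:

import time

stats = True  # set to False to disable all timing output
request_start = time.perf_counter() if stats else None
# ... do the work being timed ...
request_end = time.perf_counter() if stats else None
if stats:
    print(f"Request Time: {request_end - request_start} seconds")
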
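
The __EVENTTARGET and __EVENTARGUMENT fields in the payload emulate an ASP.NET WebForms postback: the server handles the POST as if the named menu control had been clicked and returns the bulletin view directly, no browser required. A minimal sketch against a hypothetical WebForms page (the URL and agency code are placeholders; many WebForms pages also demand __VIEWSTATE and __EVENTVALIDATION values scraped from a prior GET):

import requests

url = "https://example.com/Default.aspx"  # hypothetical endpoint
payload = {
    "MYAGCODE": "EXPD",                        # hypothetical agency code
    "__EVENTTARGET": "MainMenu$BulletinMenu",  # server-side control to "click"
    "__EVENTARGUMENT": "BulletinMenu",         # argument handed to that control
}

# requests.post(...) is equivalent to the requests.request("POST", ...) used above.
response = requests.post(url, data=payload)
print(response.status_code)
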
setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_clery.py
120 changes: 24 additions & 96 deletions
@@ -1,101 +1,29 @@
 import sys
 import os
-import requests
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
-from datetime import date
+import CG_configs as configs
+from pathlib import Path

 # This is a hack that loads the root common folder like a module (without you expressly needing to install it).
 # I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))

-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update

-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# stats default to False
-def crimegraphics_clery(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-        list_header = configs["list_header"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-        list_header = configs.list_header
-
-    # Automatically have the CLERYMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": department_code,
-        "__EVENTTARGET": "MainMenu$CLERYMenu",
-        "__EVENTARGUMENT": "CLERYMenu",
-    }
-
-    # Initialize the "data" list (a plain list, not a DataFrame)
-    data = []
-
-    print("Receiving Data... Please wait...")
-
-    # used for stats; mark the beginning of the request
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our payload
-    response = requests.request("POST", url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
-
-    search_start = function_timer(stats)
-    # this website has a bunch of empty tables with the same name;
-    # the 6th index has the data we need
-    table = soup.find_all("table", {"class": "ob_gBody"})[6]
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
-
-    hash_start = function_timer(stats)
-    # Checks if the page has been updated
-    page_update(table)
-
-    hash_end = function_timer(stats)
-    time_dif(stats, "Hash time", hash_start, hash_end)
-
-    # Use BeautifulSoup4 (bs4)'s find_all method to find all html table rows (tr)
-    rows = table.find_all("tr")
-    for row in tqdm(rows):
-        # Use BeautifulSoup4 (bs4)'s find_all method to find all html tags for table data (td)
-        td = row.find_all("td")
-        table_data = []
-        for actual_data in td:
-            table_data.append(actual_data.get_text())
-        data.append(table_data)
-
-    date_name = date.today()
-    file_name = "_" + str(date_name).replace("-", "_")  # + "_"
-
-    dataframe = pd.DataFrame(data=data, columns=list_header)
-
-    dataframe.to_csv(save_dir + department_code + file_name + "_daily_bulletin")
+from common.base_scrapers import crimegraphics_scraper
+
+configs = {
+    "url": "",
+    "department_code": "",
+    "list_header": [
+        "ChargeDescription",
+        "CaseNum",
+        "ReportDate",
+        "OffenseDate",
+        "Location",
+        "ChargeDisposition",
+    ],
+}
+
+save_dir = "./data/"
+data = []
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+crimegraphics_scraper(configs, save_dir)
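
The page_update(table) calls removed from both scripts come from common/utils/website_hasher/page_update.py, whose implementation is not shown in this diff. The general technique is to hash the scraped fragment and compare the digest against the one stored on the previous run. A rough sketch of that idea (the function shape and storage details are assumptions, not the repo's actual code):

import hashlib
from pathlib import Path

def page_update(fragment, hash_file="page.hash"):
    """Return True if the fragment changed since the last run (sketch only)."""
    digest = hashlib.sha256(str(fragment).encode("utf-8")).hexdigest()  # the page_hasher role
    path = Path(hash_file)
    unchanged = path.exists() and path.read_text() == digest            # the hash_comparer role
    path.write_text(digest)
    return not unchanged
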
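
The deleted row loop is the standard bs4 recipe for flattening an HTML table into a list of lists before handing it to pandas. A self-contained equivalent (the table HTML here is invented for illustration):

from bs4 import BeautifulSoup
import pandas as pd

html = """<table class="ob_gBody">
<tr><td>THEFT</td><td>21-0001</td></tr>
<tr><td>VANDALISM</td><td>21-0002</td></tr>
</table>"""

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", {"class": "ob_gBody"})

# One list per <tr>, one string per <td>: exactly what the deleted loop builds.
data = [[td.get_text() for td in row.find_all("td")] for row in table.find_all("tr")]
dataframe = pd.DataFrame(data=data, columns=["ChargeDescription", "CaseNum"])
print(dataframe)
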
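
After this commit each department script reduces to a config dict plus one call into common.base_scrapers. Filled in, the clery script would look roughly like this (the URL and agency code are hypothetical placeholders):

from common.base_scrapers import crimegraphics_scraper

configs = {
    "url": "https://example.crimegraphics.com/2013/default.aspx",  # hypothetical
    "department_code": "EXPD",                                     # hypothetical
    "list_header": [
        "ChargeDescription",
        "CaseNum",
        "ReportDate",
        "OffenseDate",
        "Location",
        "ChargeDisposition",
    ],
}

crimegraphics_scraper(configs, "./data/")
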
