Skip to content

Commit 714455c

Browse files
committed
Update rvdss_historic.py
1 parent 6a002e0 commit 714455c

File tree

1 file changed

+19
-58
lines changed

1 file changed

+19
-58
lines changed

src/acquisition/rvdss/rvdss_historic.py

Lines changed: 19 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
from datetime import datetime,timedelta
77
import math
88

9+
from constants import BASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTENRATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
910
from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data
1011
#%% Functions
1112

@@ -27,7 +28,7 @@ def append_urls(urls):
2728

2829
http_present = re.search("http:",temp_url)
2930
if not http_present:
30-
urls[i]="https://www.canada.ca"+temp_url
31+
urls[i]=SEASON_BASE_URL+temp_url
3132
else:
3233
urls[i]=re.sub("http:","https:",temp_url)
3334
return(urls)
@@ -36,7 +37,7 @@ def report_urls(soup):
3637
# Get links for individual weeks
3738
year= "-".join(get_report_season(soup))
3839
links=soup.find_all('a')
39-
alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year
40+
alternative_url = ALTENRATIVE_SEASON_BASE_URL+year
4041

4142
urls = [link.get("href") for link in links if "ending" in str(link) or
4243
alternative_url in str(link)]
@@ -51,7 +52,7 @@ def report_weeks(soup):
5152
return(weeks)
5253

5354
def get_report_date(week,start_year,epi=False):
54-
if week < 35:
55+
if week < LAST_WEEK_OF_YEAR:
5556
year=int(start_year)+1
5657
else:
5758
year=int(start_year)
@@ -79,14 +80,16 @@ def get_table_captions(soup):
7980
caption = captions[i]
8081

8182
matches = ["period","abbreviation","cumulative", "compared"] #skip historic comparisons and cumulative tables
82-
if any(x in caption.text.lower() for x in matches):
83+
if any(x in caption.text.lower() for x in matches) or caption.has_attr('class') or all(name not in caption.text.lower() for name in table_identifiers):
8384
remove_list.append(caption)
8485

86+
'''
8587
elif caption.has_attr('class'):
8688
remove_list.append(caption)
8789
8890
elif all(name not in caption.text.lower() for name in table_identifiers):
8991
remove_list.append(caption)
92+
'''
9093

9194
new_captions = [cap for cap in captions if cap not in remove_list]
9295
new_captions = list(set(new_captions))
@@ -255,17 +258,6 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
255258
geo_types = [create_geo_types(g,"lab") for g in table['geo_value']]
256259
table.insert(3,"geo_type",geo_types)
257260

258-
# Calculate number of positive tests based on pct_positive and total tests
259-
if flu:
260-
table["flu_a_positive_tests"] = (table["flu_a_pct_positive"]/100)*table["flu_tests"]
261-
table["flu_b_positive_tests"] = (table["flu_b_pct_positive"]/100)*table["flu_tests"]
262-
263-
table["flu_positive_tests"] = table["flu_a_positive_tests"] + table["flu_b_positive_tests"]
264-
table["flu_pct_positive"] = (table["flu_positive_tests"]/table["flu_tests"])*100
265-
else:
266-
table[virus+"_positive_tests"] = (table[virus+"_pct_positive"]/100) *table[virus+"_tests"]
267-
268-
269261
table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
270262

271263
return(table)
@@ -291,11 +283,9 @@ def get_season_reports(url):
291283

292284
# Skip empty pages
293285
if season[0] == '2019':
294-
if current_week == 5:
295-
continue
296-
elif current_week == 47:
286+
if current_week == 5 or current_week == 47:
297287
continue
298-
288+
299289
# Get page for the current week
300290
temp_url=urls[week_num]
301291
temp_page=requests.get(temp_url)
@@ -318,15 +308,12 @@ def get_season_reports(url):
318308
if "Positive Adenovirus" in caption.text:
319309
tab.select_one('td').decompose()
320310

321-
if not "number" in caption.text.lower():
322-
# Replace commas with periods
323-
tab = re.sub(",",r".",str(tab))
324-
else:
325-
tab = re.sub(",",r"",str(tab))
326-
311+
# Replace commas with periods
312+
tab2 = re.sub(",",r".",str(tab))
313+
327314
# Read table
328315
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
329-
table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
316+
table = pd.read_html(tab2,na_values=na_values)[0].dropna(how="all")
330317

331318
# Check for multiline headers
332319
if isinstance(table.columns, pd.MultiIndex):
@@ -425,41 +412,15 @@ def get_season_reports(url):
425412
all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)
426413

427414
#%% Scrape each season
428-
429-
urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html",
430-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html",
431-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html",
432-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html",
433-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html",
434-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html",
435-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html",
436-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html",
437-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html",
438-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html",
439-
"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"]
440-
441-
[get_season_reports(url) for url in urls]
442-
415+
[get_season_reports(url) for url in HISTORIC_SEASON_URL]
443416

444417
#%% Update the end of the 2023-2024 season with the dashboard data
445-
446-
base_urls=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",
447-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/",
448-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/",
449-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/",
450-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/",
451-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/",
452-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/",
453-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/",
454-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/",
455-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/",
456-
"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/"]
457418

458419
# Load old csvs
459-
old_detection_data = pd.read_csv('season_2023_2024/season_2023_2024_respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
460-
old_positive_data = pd.read_csv('season_2023_2024/season_2023_2024_positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
420+
old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
421+
old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
461422

462-
for base_url in base_urls:
423+
for base_url in BASHBOARD_BASE_URLS_2023:
463424
# Get weekly dashboard data
464425
weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
465426
positive_data = get_revised_data(base_url)
@@ -473,6 +434,6 @@ def get_season_reports(url):
473434
old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
474435

475436
# Overwrite/update csvs
476-
old_detection_data.to_csv('season_2023_2024/season_2023_2024_respiratory_detections.csv',index=True)
477-
old_positive_data.to_csv('season_2023_2024/season_2023_2024_positive_tests.csv',index=True)
437+
old_detection_data.to_csv('season_2023_2024/respiratory_detections.csv',index=True)
438+
old_positive_data.to_csv('season_2023_2024/positive_tests.csv',index=True)
478439

0 commit comments

Comments
 (0)