66from datetime import datetime ,timedelta
77import math
88
9+ from constants import BASHBOARD_BASE_URLS_2023 , HISTORIC_SEASON_URL , ALTENRATIVE_SEASON_BASE_URL , SEASON_BASE_URL , LAST_WEEK_OF_YEAR
910from utils import abbreviate_virus ,abbreviate_geo ,create_geo_types ,check_date_format ,get_revised_data ,get_weekly_data
1011 #%% Functions
1112
@@ -27,7 +28,7 @@ def append_urls(urls):
2728
2829 http_present = re .search ("http:" ,temp_url )
2930 if not http_present :
30- urls [i ]= "https://www.canada.ca" + temp_url
31+ urls [i ]= SEASON_BASE_URL + temp_url
3132 else :
3233 urls [i ]= re .sub ("http:" ,"https:" ,temp_url )
3334 return (urls )
@@ -36,7 +37,7 @@ def report_urls(soup):
3637 # Get links for individual weeks
3738 year = "-" .join (get_report_season (soup ))
3839 links = soup .find_all ('a' )
39- alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/" + year
40+ alternative_url = ALTENRATIVE_SEASON_BASE_URL + year
4041
4142 urls = [link .get ("href" ) for link in links if "ending" in str (link ) or
4243 alternative_url in str (link )]
@@ -51,7 +52,7 @@ def report_weeks(soup):
5152 return (weeks )
5253
5354def get_report_date (week ,start_year ,epi = False ):
54- if week < 35 :
55+ if week < LAST_WEEK_OF_YEAR :
5556 year = int (start_year )+ 1
5657 else :
5758 year = int (start_year )
@@ -79,14 +80,16 @@ def get_table_captions(soup):
7980 caption = captions [i ]
8081
8182 matches = ["period" ,"abbreviation" ,"cumulative" , "compared" ] #skip historic comparisons and cumulative tables
82- if any (x in caption .text .lower () for x in matches ):
83+ if any (x in caption .text .lower () for x in matches ) or caption . has_attr ( 'class' ) or all ( name not in caption . text . lower () for name in table_identifiers ) :
8384 remove_list .append (caption )
8485
86+ '''
8587 elif caption.has_attr('class'):
8688 remove_list.append(caption)
8789
8890 elif all(name not in caption.text.lower() for name in table_identifiers):
8991 remove_list.append(caption)
92+ '''
9093
9194 new_captions = [cap for cap in captions if cap not in remove_list ]
9295 new_captions = list (set (new_captions ))
@@ -255,17 +258,6 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
255258 geo_types = [create_geo_types (g ,"lab" ) for g in table ['geo_value' ]]
256259 table .insert (3 ,"geo_type" ,geo_types )
257260
258- # Calculate number of positive tests based on pct_positive and total tests
259- if flu :
260- table ["flu_a_positive_tests" ] = (table ["flu_a_pct_positive" ]/ 100 )* table ["flu_tests" ]
261- table ["flu_b_positive_tests" ] = (table ["flu_b_pct_positive" ]/ 100 )* table ["flu_tests" ]
262-
263- table ["flu_positive_tests" ] = table ["flu_a_positive_tests" ] + table ["flu_b_positive_tests" ]
264- table ["flu_pct_positive" ] = (table ["flu_positive_tests" ]/ table ["flu_tests" ])* 100
265- else :
266- table [virus + "_positive_tests" ] = (table [virus + "_pct_positive" ]/ 100 ) * table [virus + "_tests" ]
267-
268-
269261 table = table .set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
270262
271263 return (table )
@@ -291,11 +283,9 @@ def get_season_reports(url):
291283
292284 # Skip empty pages
293285 if season [0 ] == '2019' :
294- if current_week == 5 :
295- continue
296- elif current_week == 47 :
286+ if current_week == 5 or current_week == 47 :
297287 continue
298-
288+
299289 # Get page for the current week
300290 temp_url = urls [week_num ]
301291 temp_page = requests .get (temp_url )
@@ -318,15 +308,12 @@ def get_season_reports(url):
318308 if "Positive Adenovirus" in caption .text :
319309 tab .select_one ('td' ).decompose ()
320310
321- if not "number" in caption .text .lower ():
322- # Replace commas with periods
323- tab = re .sub ("," ,r"." ,str (tab ))
324- else :
325- tab = re .sub ("," ,r"" ,str (tab ))
326-
311+ # Replace commas with periods
312+ tab2 = re .sub ("," ,r"." ,str (tab ))
313+
327314 # Read table
328315 na_values = ['N.A.' ,'N.A' , 'N.C.' ,'N.R.' ,'Not Available' ,'Not Tested' ,"N.D." ,"-" ]
329- table = pd .read_html (tab ,na_values = na_values )[0 ].dropna (how = "all" )
316+ table = pd .read_html (tab2 ,na_values = na_values )[0 ].dropna (how = "all" )
330317
331318 # Check for multiline headers
332319 if isinstance (table .columns , pd .MultiIndex ):
@@ -425,41 +412,15 @@ def get_season_reports(url):
425412 all_number_tables .to_csv (path + "/number_of_detections.csv" , index = True )
426413
427414 #%% Scrape each season
428-
429- urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html" ,
430- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html" ,
431- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html" ,
432- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html" ,
433- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html" ,
434- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html" ,
435- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html" ,
436- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html" ,
437- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html" ,
438- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html" ,
439- "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html" ]
440-
441- [get_season_reports (url ) for url in urls ]
442-
415+ [get_season_reports (url ) for url in HISTORIC_SEASON_URL ]
443416
444417 #%% Update the end of the 2023-2024 season with the dashboard data
445-
446- base_urls = ["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/" ,
447- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/" ,
448- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/" ,
449- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/" ,
450- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/" ,
451- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/" ,
452- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/" ,
453- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/" ,
454- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/" ,
455- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/" ,
456- "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/" ]
457418
458419# Load old csvs
459- old_detection_data = pd .read_csv ('season_2023_2024/season_2023_2024_respiratory_detections .csv' ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
460- old_positive_data = pd .read_csv ('season_2023_2024/season_2023_2024_positive_tests .csv' ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
420+ old_detection_data = pd .read_csv ('season_2023_2024/respiratory_detections .csv' ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
421+ old_positive_data = pd .read_csv ('season_2023_2024/positive_tests .csv' ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
461422
462- for base_url in base_urls :
423+ for base_url in BASHBOARD_BASE_URLS_2023 :
463424 # Get weekly dashboard data
464425 weekly_data = get_weekly_data (base_url ,2023 ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
465426 positive_data = get_revised_data (base_url )
@@ -473,6 +434,6 @@ def get_season_reports(url):
473434 old_positive_data = pd .concat ([old_positive_data ,positive_data ],axis = 0 )
474435
475436# Overwrite/update csvs
476- old_detection_data .to_csv ('season_2023_2024/season_2023_2024_respiratory_detections .csv' ,index = True )
477- old_positive_data .to_csv ('season_2023_2024/season_2023_2024_positive_tests .csv' ,index = True )
437+ old_detection_data .to_csv ('season_2023_2024/respiratory_detections .csv' ,index = True )
438+ old_positive_data .to_csv ('season_2023_2024/positive_tests .csv' ,index = True )
478439
0 commit comments