 from epiweeks import Week
 from datetime import datetime, timedelta
 import math
-import io

+from utils import abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format, get_revised_data, get_weekly_data
 #%% Functions

 # Report Functions
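The six helpers named in the new import are deleted from this file below rather than dropped outright; presumably they now live in a new `utils` module. A minimal sketch of the expected `utils.py` interface, assuming the function bodies move over unchanged (signatures taken from the import line, behaviour from the deletions below):

```python
# utils.py -- hypothetical layout, not part of this commit;
# imports cover what the relocated bodies reference.
import io
import math
import re
from datetime import datetime

import pandas as pd
import requests
from epiweeks import Week

def abbreviate_virus(full_name): ...            # e.g. "influenza" -> "flu"
def abbreviate_geo(full_name): ...              # e.g. "Nova Scotia" -> "ns"
def create_geo_types(geo, default_geo): ...     # "ca" -> "nation", "qc" -> "region"
def check_date_format(date_string): ...         # normalise dates to YYYY-MM-DD
def get_revised_data(base_url): ...             # dashboard revision feed -> wide DataFrame
def get_weekly_data(base_url, start_year): ...  # dashboard current-week table -> DataFrame
```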
@@ -25,16 +25,18 @@ def append_urls(urls):
     for i in range(len(urls)):
         temp_url = urls[i]

-        http_present = re.search("http", temp_url)
+        http_present = re.search("http:", temp_url)
         if not http_present:
             urls[i] = "https://www.canada.ca" + temp_url
+        else:
+            urls[i] = re.sub("http:", "https:", temp_url)
     return(urls)

 def report_urls(soup):
     # Get links for individual weeks
     year = "-".join(get_report_season(soup))
     links = soup.find_all('a')
-    alternative_url = "http://www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/" + year
+    alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/" + year

     urls = [link.get("href") for link in links if "ending" in str(link) or
             alternative_url in str(link)]
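The two changes in this hunk work together: `append_urls` now distinguishes relative report links (prefixed with the canada.ca domain) from absolute legacy `http:` links (upgraded in place to `https:`), and `report_urls` drops the hard-coded scheme from `alternative_url` so the substring match catches both `http://` and `https://` variants of the archive links. A self-contained sketch of the patched `append_urls` behaviour (illustrative only):

```python
import re

def append_urls(urls):
    # Reproduces the patched logic for illustration.
    for i in range(len(urls)):
        temp_url = urls[i]
        if not re.search("http:", temp_url):
            # relative link scraped from the page -> absolute https link
            urls[i] = "https://www.canada.ca" + temp_url
        else:
            # legacy absolute link -> upgrade the scheme
            urls[i] = re.sub("http:", "https:", temp_url)
    return urls

assert append_urls(["/en/report.html"]) == ["https://www.canada.ca/en/report.html"]
assert append_urls(["http://www.phac-aspc.gc.ca/rvdi-divr/x.html"]) == \
    ["https://www.phac-aspc.gc.ca/rvdi-divr/x.html"]
```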
@@ -63,74 +65,7 @@ def get_report_date(week,start_year,epi=False):

     return(report_date)

-def abbreviate_virus(full_name):
-    lowercase = full_name.lower()
-
-    if any(name in lowercase for name in ["parainfluenza", "para", "piv"]):
-        if "hpiv" not in lowercase:
-            abbrev = re.sub("parainfluenza|para|piv", "hpiv", lowercase)
-        else:
-            abbrev = lowercase
-    elif any(name in lowercase for name in ["adenovirus", "adeno"]):
-        abbrev = re.sub("adenovirus|adeno", "adv", lowercase)
-    elif "human metapneumovirus" in lowercase:
-        abbrev = re.sub("human metapneumovirus", "hmpv", lowercase)
-    elif any(name in lowercase for name in ["enterovirus/rhinovirus", "rhinovirus", "rhv", "entero/rhino", "rhino", "ev/rv", "evrv"]):
-        abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv", "ev_rv", lowercase)
-    elif any(name in lowercase for name in ["coronavirus", "coron", "coro"]):
-        abbrev = re.sub("coronavirus|coron|coro", "hcov", lowercase)
-    elif "respiratory syncytial virus" in lowercase:
-        abbrev = re.sub("respiratory syncytial virus", "rsv", lowercase)
-    elif "influenza" in lowercase:
-        abbrev = re.sub("influenza", "flu", lowercase)
-    elif "sarscov2" in lowercase:
-        abbrev = re.sub("sarscov2", "sars-cov-2", lowercase)
-    else:
-        abbrev = lowercase
-    return(abbrev)
-
-def abbreviate_geo(full_name):
-    lowercase = full_name.lower()
-
-    if "newfoundland" in lowercase:
-        abbrev = "nl"
-    elif "prince edward island" in lowercase:
-        abbrev = "pe"
-    elif "nova scotia" in lowercase:
-        abbrev = "ns"
-    elif "new brunswick" in lowercase:
-        abbrev = "nb"
-    elif re.match('|'.join(("^québec$", "province of québec", "quebec")), lowercase):
-        abbrev = "qc"
-    elif re.match('|'.join(("^ontario$", "province of ontario")), lowercase):
-        abbrev = "on"
-    elif "manitoba" in lowercase:
-        abbrev = "mb"
-    elif "saskatchewan" in lowercase:
-        abbrev = "sk"
-    elif "alberta" in lowercase:
-        abbrev = "ab"
-    elif "british columbia" in lowercase:
-        abbrev = "bc"
-    elif "yukon" in lowercase:
-        abbrev = "yk"
-    elif "northwest territories" in lowercase:
-        abbrev = "nt"
-    elif "nunavut" in lowercase:
-        abbrev = "nu"
-    elif re.match("canada|can", lowercase):
-        abbrev = "ca"
-    elif re.match(r"^at\b", lowercase):
-        abbrev = "atlantic"
-    elif "pr" in lowercase:
-        abbrev = "prairies"
-    elif "terr" in lowercase:
-        abbrev = "territories"
-    else:
-        abbrev = lowercase
-    return(abbrev)
+


 def get_table_captions(soup):
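The two abbreviation helpers keep their behaviour after the move; their job is to map scraped display names onto the short codes used in column and geo values. A few spot checks, assuming `utils` exposes them with the bodies deleted above:

```python
from utils import abbreviate_virus, abbreviate_geo  # hypothetical module path

assert abbreviate_virus("Parainfluenza 1") == "hpiv 1"
assert abbreviate_virus("Respiratory syncytial virus") == "rsv"
assert abbreviate_geo("British Columbia") == "bc"
assert abbreviate_geo("Province of Ontario") == "on"
```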
@@ -183,19 +118,6 @@ def get_modified_dates(soup,week_end_date):

     return(new_modified_date_string)

-def check_date_format(date_string):
-    if not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}", date_string):
-        if re.search(r"/", date_string):
-            new_date = re.sub(r"/", "-", date_string)
-            new_date = datetime.strptime(new_date, "%d-%m-%Y").strftime("%Y-%m-%d")
-        elif re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}", date_string):
-            new_date = datetime.strptime(date_string, "%d-%m-%Y").strftime("%Y-%m-%d")
-        else:
-            raise AssertionError("Unrecognised date format")
-    else:
-        new_date = date_string
-
-    return(new_date)

 def check_duplicate_rows(table):
     if table['week'].duplicated().any():
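`check_date_format` also moves to `utils` with its behaviour intact: ISO `YYYY-MM-DD` strings pass through, `d/m/Y` and `d-m-Y` strings are normalised to ISO, and anything else raises. Spot checks, assuming the relocated body matches the deletion above:

```python
from utils import check_date_format  # hypothetical module path

assert check_date_format("2024-09-01") == "2024-09-01"  # already ISO, unchanged
assert check_date_format("01/09/2024") == "2024-09-01"  # d/m/Y normalised
assert check_date_format("01-09-2024") == "2024-09-01"  # d-m-Y normalised
```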
@@ -213,18 +135,7 @@ def check_duplicate_rows(table):
         new_table = table
     return(new_table)

-def create_geo_types(geo, default_geo):
-    regions = ['atlantic', 'atl', 'province of québec', 'québec', 'qc', 'province of ontario', 'ontario', 'on',
-               'prairies', 'pr', "british columbia", 'bc', "territories", 'terr']
-    nation = ["canada", "can", 'ca']
-
-    if geo in nation:
-        geo_type = "nation"
-    elif geo in regions:
-        geo_type = "region"
-    else:
-        geo_type = default_geo
-    return(geo_type)
+

 def create_detections_table(table, modified_date, week_number, week_end_date, start_year):
     lab_columns = [col for col in table.columns if 'reporting' in col][0]
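`create_geo_types` is likewise relocated unchanged: it buckets an already-abbreviated geo string into `"nation"` or `"region"`, falling back to the caller-supplied default for everything else. Assuming the `utils` version matches the deletion above:

```python
from utils import create_geo_types  # hypothetical module path

assert create_geo_types("ca", "province") == "nation"
assert create_geo_types("atlantic", "province") == "region"
assert create_geo_types("ns", "lab") == "lab"  # unknown geo -> caller's default
```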
@@ -501,103 +412,6 @@ def get_season_reports(url):
     if len(all_number_tables) != 0:
         all_number_tables.to_csv(path + "/" + path + "_number_of_detections.csv", index=True)

-# Dashboard functions
-def get_revised_data(base_url):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
-    }
-
-    # Get update date
-    update_date_url = base_url + "RVD_UpdateDate.csv"
-    update_date_url_response = requests.get(update_date_url, headers=headers)
-    update_date = datetime.strptime(update_date_url_response.text, "%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d")
-
-    # Get update data
-    url = base_url + "RVD_WeeklyData.csv"
-
-    url_response = requests.get(url, headers=headers)
-    df = pd.read_csv(io.StringIO(url_response.text))
-
-    df['virus'] = [abbreviate_virus(v) for v in df['virus']]
-    epiw = df.apply(lambda x: Week(x['year'], x['week']), axis=1)
-    df.insert(0, "epiweek", [int(str(w)) for w in epiw])
-    df['epiweek'] = [int(str(w)) for w in df['epiweek']]
-    df['province'] = [abbreviate_geo(g) for g in df['province']]
-    df = df.rename(columns={'province': "geo_value", 'date': 'time_value', "detections": "positivetests"})
-    df['time_value'] = [check_date_format(d) for d in df['time_value']]
-    df['geo_type'] = [create_geo_types(g, "province") for g in df['geo_value']]
-    df.insert(1, "issue", update_date)
-
-    df = df.drop(["weekorder", "region", "year", "week"], axis=1)
-
-    df = df.pivot(index=['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'],
-                  columns="virus", values=['tests', 'percentpositive', 'positivetests'])
-    df.columns = ['_'.join(col).strip() for col in df.columns.values]
-    df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:] + x.split('_')[:1]))
-    df.columns = [re.sub("positivetests", "positive_tests", col) for col in df.columns]
-    df.columns = [re.sub("percentpositive", "pct_positive", col) for col in df.columns]
-    df.columns = [re.sub(r' ', '_', c) for c in df.columns]
-
-    for k in range(len(df.columns)):
-        if "pct_positive" in df.columns[k]:
-            assert all([0 <= val <= 100 or math.isnan(val) for val in df[df.columns[k]]]), "Percentage not from 0-100"
-
-    return(df)
-
-def get_weekly_data(base_url, start_year):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
-    }
-
-    # Get update date
-    update_date_url = base_url + "RVD_UpdateDate.csv"
-    update_date_url_response = requests.get(update_date_url, headers=headers)
-    update_date = datetime.strptime(update_date_url_response.text, "%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d")
-
-    # Get current week and year
-    summary_url = base_url + "RVD_SummaryText.csv"
-    summary_url_response = requests.get(summary_url, headers=headers)
-    summary_df = pd.read_csv(io.StringIO(summary_url_response.text))
-
-    week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type'] == "title")]
-    week_string = week_df.iloc[0]['Text'].lower()
-    current_week = int(re.search("week (.+?) ", week_string).group(1))
-
-    if current_week < 34:
-        current_year = start_year + 1
-    else:
-        current_year = start_year
-
-    current_epiweek = Week(current_year, current_week)
-
-    # Get weekly data
-    weekly_url = base_url + "RVD_CurrentWeekTable.csv"
-    weekly_url_response = requests.get(weekly_url, headers=headers)
-    weekly_url_response.encoding = 'UTF-8'
-    df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text))
-
-    df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:] + x.split('_')[:1]))
-    df_weekly.insert(0, "epiweek", int(str(current_epiweek)))
-    df_weekly.insert(1, "time_value", str(current_epiweek.enddate()))
-    df_weekly.insert(2, "issue", update_date)
-    df_weekly.columns = [abbreviate_virus(c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'test\b', 'tests', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'pos\b', 'positive_tests', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'flua_', 'flu_a', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'flub_', 'flu_b', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'bpositive', 'b_positive', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'apositive', 'a_positive', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r'flu_ah1_', 'flu_ah1pdm09_', c) for c in df_weekly.columns]
-    df_weekly.columns = [re.sub(r' ', '_', c) for c in df_weekly.columns]
-    df_weekly = df_weekly.rename(columns={'reportinglaboratory': "geo_value"})
-    df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']]
-    df_weekly['geo_type'] = [create_geo_types(g, "lab") for g in df_weekly['geo_value']]
-
-    #df_weekly = df_weekly.drop(["weekorder","date","week"], axis=1)
-
-    return(df_weekly)
-
-
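The largest deletion is the two dashboard readers, `get_revised_data` and `get_weekly_data`, which move to `utils` alongside the helpers they call. The least obvious step in `get_revised_data` is the pivot from one row per (geo, week, virus) to one wide row with `<virus>_<measure>` columns; a toy illustration of that reshape with made-up numbers (not the module's actual data):

```python
import pandas as pd

df = pd.DataFrame({
    "epiweek": [202401, 202401],
    "geo_value": ["bc", "bc"],
    "virus": ["flu", "rsv"],
    "tests": [100, 80],
    "positivetests": [12, 4],
})

# Wide format: one column per (measure, virus) pair.
wide = df.pivot(index=["epiweek", "geo_value"], columns="virus",
                values=["tests", "positivetests"])
# Flatten the MultiIndex ('tests', 'flu') -> 'tests_flu' ...
wide.columns = ["_".join(col) for col in wide.columns]
# ... then rotate to virus-first names, 'tests_flu' -> 'flu_tests'.
wide = wide.rename(columns=lambda x: "_".join(x.split("_")[1:] + x.split("_")[:1]))

print(wide.columns.tolist())
# ['flu_tests', 'rsv_tests', 'flu_positivetests', 'rsv_positivetests']
```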
 #%% Scrape each season

 urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html",