Skip to content

Commit 01af95f

Browse files
committed
create utils.py for common functions
1 parent 073aac9 commit 01af95f

File tree

3 files changed

+209
-392
lines changed

3 files changed

+209
-392
lines changed

src/acquisition/rvdss/rvdss_historic.py

Lines changed: 7 additions & 193 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
from epiweeks import Week
66
from datetime import datetime,timedelta
77
import math
8-
import io
98

9+
from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data
1010
#%% Functions
1111

1212
# Report Functions
@@ -25,16 +25,18 @@ def append_urls(urls):
2525
for i in range(len(urls)):
2626
temp_url = urls[i]
2727

28-
http_present = re.search("http",temp_url)
28+
http_present = re.search("http:",temp_url)
2929
if not http_present:
3030
urls[i]="https://www.canada.ca"+temp_url
31+
else:
32+
urls[i]=re.sub("http:","https:",temp_url)
3133
return(urls)
3234

3335
def report_urls(soup):
3436
# Get links for individual weeks
3537
year= "-".join(get_report_season(soup))
3638
links=soup.find_all('a')
37-
alternative_url = "http://www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year
39+
alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year
3840

3941
urls = [link.get("href") for link in links if "ending" in str(link) or
4042
alternative_url in str(link)]
@@ -63,74 +65,7 @@ def get_report_date(week,start_year,epi=False):
6365

6466
return(report_date)
6567

66-
# Ordered (pattern, replacement) rules used by abbreviate_virus.  The first
# pattern found anywhere in the lower-cased name decides the replacement, so
# order matters: "parainfluenza" has to be tried before "influenza".
_VIRUS_ABBREVIATIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"parainfluenza|para|piv", "hpiv"),
        (r"adenovirus|adeno", "adv"),
        (r"human metapneumovirus", "hmpv"),
        (r"enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv", "ev_rv"),
        (r"coronavirus|coron|coro", "hcov"),
        (r"respiratory syncytial virus", "rsv"),
        (r"influenza", "flu"),
        (r"sarscov2", "sars-cov-2"),
    )
)


def abbreviate_virus(full_name):
    """Return the lower-cased ``full_name`` with its virus name abbreviated.

    The name is lower-cased, then the first rule in ``_VIRUS_ABBREVIATIONS``
    whose pattern occurs in it is applied to every occurrence.  Only that one
    rule is applied; text around the virus name (subtypes, suffixes such as
    "_tests") is kept as is.

    Parameters
    ----------
    full_name : str
        Virus name or column label containing a virus name.

    Returns
    -------
    str
        The lower-cased name with the virus part abbreviated, or the
        lower-cased name unchanged when no rule matches.
    """
    lowercase = full_name.lower()

    for pattern, replacement in _VIRUS_ABBREVIATIONS:
        if not pattern.search(lowercase):
            continue
        # "hpiv" itself contains "piv"; a name that is already abbreviated
        # must not be rewritten again (it would become "hhpiv").
        if replacement == "hpiv" and "hpiv" in lowercase:
            return lowercase
        return pattern.sub(replacement, lowercase)

    return lowercase
91-
92-
# Ordered (pattern, abbreviation) rules used by abbreviate_geo; the first
# pattern found in the lower-cased name wins.  Patterns starting with "^" only
# match at the beginning of the name, all others match anywhere in it.
_GEO_ABBREVIATIONS = tuple(
    (re.compile(pattern), abbreviation)
    for pattern, abbreviation in (
        (r"newfoundland", "nl"),
        (r"prince edward island", "pe"),
        (r"nova scotia", "ns"),
        (r"new brunswick", "nb"),
        # "québec" must be the whole name, while the other two spellings only
        # have to start the name.
        (r"^(?:québec$|province of québec|quebec)", "qc"),
        (r"^(?:ontario$|province of ontario)", "on"),
        (r"manitoba", "mb"),
        (r"saskatchewan", "sk"),
        (r"alberta", "ab"),
        (r"british columbia", "bc"),
        (r"yukon", "yk"),
        (r"northwest territories", "nt"),
        (r"nunavut", "nu"),
        (r"^(?:canada|can)", "ca"),
        (r"^at\b", "atlantic"),
        # NOTE(review): this matches "pr" anywhere in the name, so any name
        # not caught by an earlier rule that merely contains "pr" becomes
        # "prairies" - confirm this is intended.
        (r"pr", "prairies"),
        (r"terr", "territories"),
    )
)


def abbreviate_geo(full_name):
    """Return the abbreviation for a geography name.

    The name is lower-cased and compared against ``_GEO_ABBREVIATIONS`` in
    order; the abbreviation of the first matching rule is returned.

    Parameters
    ----------
    full_name : str
        Name of a province, territory, region, or the country.

    Returns
    -------
    str
        The abbreviation of the first matching rule, or the lower-cased name
        unchanged when no rule matches.
    """
    lowercase = full_name.lower()

    for pattern, abbreviation in _GEO_ABBREVIATIONS:
        if pattern.search(lowercase):
            return abbreviation

    return lowercase
68+
13469

13570

13671
def get_table_captions(soup):
@@ -183,19 +118,6 @@ def get_modified_dates(soup,week_end_date):
183118

184119
return(new_modified_date_string)
185120

186-
def check_date_format(date_string):
    """Return ``date_string`` as a ``YYYY-MM-DD`` date.

    Handled inputs:

    * anything that already contains a ``YYYY-MM-DD`` date is returned
      unchanged;
    * ``YYYY/MM/DD`` has its slashes replaced by dashes;
    * ``DD/MM/YYYY`` and ``DD-MM-YYYY`` are parsed day first and re-formatted.

    Parameters
    ----------
    date_string : str
        Date in one of the formats listed above.

    Returns
    -------
    str
        The date formatted as ``YYYY-MM-DD``.

    Raises
    ------
    AssertionError
        If the string has no slash and matches none of the known formats.
    ValueError
        If the string has a known shape but is not a valid date.
    """
    iso_pattern = "[0-9]{4}-[0-9]{2}-[0-9]{2}"

    if re.search(iso_pattern, date_string):
        return date_string

    if "/" in date_string:
        dashed = date_string.replace("/", "-")
        # A year-first date written with slashes would fail the day-first
        # parse below, so it is handled separately.
        if re.fullmatch(iso_pattern, dashed):
            return datetime.strptime(dashed, "%Y-%m-%d").strftime("%Y-%m-%d")
        return datetime.strptime(dashed, "%d-%m-%Y").strftime("%Y-%m-%d")

    if re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}", date_string):
        return datetime.strptime(date_string, "%d-%m-%Y").strftime("%Y-%m-%d")

    raise AssertionError("Unrecognised date format")
199121

200122
def check_duplicate_rows(table):
201123
if table['week'].duplicated().any():
@@ -213,18 +135,7 @@ def check_duplicate_rows(table):
213135
new_table=table
214136
return(new_table)
215137

216-
# Geography names that create_geo_types labels "region" and "nation".  They
# are module-level constants so they are not rebuilt on every call.
_REGION_GEOS = (
    'atlantic', 'atl', 'province of québec', 'québec', 'qc',
    'province of ontario', 'ontario', 'on',
    'prairies', 'pr', "british columbia", 'bc', "territories", 'terr',
)
_NATION_GEOS = ("canada", "can", 'ca')


def create_geo_types(geo, default_geo):
    """Return the geography type for ``geo``.

    Parameters
    ----------
    geo : str
        Geography value; it is compared exactly (no case folding) against the
        known nation and region names.
    default_geo : str
        Type to return when ``geo`` is neither a nation nor a region.

    Returns
    -------
    str
        ``"nation"``, ``"region"``, or ``default_geo``.
    """
    if geo in _NATION_GEOS:
        return "nation"
    if geo in _REGION_GEOS:
        return "region"
    return default_geo
138+
228139

229140
def create_detections_table(table,modified_date,week_number,week_end_date,start_year):
230141
lab_columns =[col for col in table.columns if 'reporting' in col][0]
@@ -501,103 +412,6 @@ def get_season_reports(url):
501412
if len(all_number_tables) != 0:
502413
all_number_tables.to_csv(path+"/"+path+"_number_of_detections.csv", index=True)
503414

504-
# Dashboard functions
505-
def get_revised_data(base_url):
    """Download the revised weekly data and reshape it to wide format.

    Two files are fetched from ``base_url``: ``RVD_UpdateDate.csv`` (used as
    the issue date) and ``RVD_WeeklyData.csv`` (the data).  Virus and province
    names are abbreviated, the table is pivoted so every virus/measure pair
    becomes one column named ``<virus>_<measure>``, and each ``pct_positive``
    column is checked to lie within 0-100.

    Parameters
    ----------
    base_url : str
        URL prefix the two file names are appended to.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``epiweek``, ``time_value``, ``issue``, ``geo_type`` and
        ``geo_value``, with one column per virus and measure.

    Raises
    ------
    requests.HTTPError
        If either download returns an error status.
    AssertionError
        If a ``pct_positive`` column holds a value outside 0-100.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    # Get update date
    update_date_response = requests.get(base_url + "RVD_UpdateDate.csv", headers=headers)
    # Fail here with a clear HTTP error rather than later while parsing an
    # error page as a date.
    update_date_response.raise_for_status()
    update_date = datetime.strptime(
        update_date_response.text.strip(), "%m/%d/%Y %H:%M:%S"
    ).strftime("%Y-%m-%d")

    # Get update data
    data_response = requests.get(base_url + "RVD_WeeklyData.csv", headers=headers)
    data_response.raise_for_status()
    df = pd.read_csv(io.StringIO(data_response.text))

    df['virus'] = [abbreviate_virus(v) for v in df['virus']]
    epiweeks = df.apply(lambda row: Week(row['year'], row['week']), axis=1)
    df.insert(0, "epiweek", [int(str(w)) for w in epiweeks])
    df['province'] = [abbreviate_geo(g) for g in df['province']]
    df = df.rename(columns={'province': "geo_value", 'date': 'time_value', "detections": "positivetests"})
    df['time_value'] = [check_date_format(d) for d in df['time_value']]
    df['geo_type'] = [create_geo_types(g, "province") for g in df['geo_value']]
    df.insert(1, "issue", update_date)

    df = df.drop(["weekorder", "region", "year", "week"], axis=1)

    df = df.pivot(index=['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'],
                  columns="virus", values=['tests', 'percentpositive', 'positivetests'])
    # The pivot yields (measure, virus) column pairs; join them and move the
    # measure to the end so the names read "<virus>_<measure>".
    df.columns = ['_'.join(col).strip() for col in df.columns.values]
    df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:] + x.split('_')[:1]))
    df.columns = [re.sub("positivetests", "positive_tests", col) for col in df.columns]
    df.columns = [re.sub("percentpositive", "pct_positive", col) for col in df.columns]
    df.columns = [re.sub(r' ', '_', c) for c in df.columns]

    for column in df.columns:
        if "pct_positive" not in column:
            continue
        # Raised explicitly so the check still runs under "python -O", which
        # strips assert statements.
        if not all(0 <= val <= 100 or math.isnan(val) for val in df[column]):
            raise AssertionError("Percentage not from 0-100")

    return df
546-
547-
def get_weekly_data(base_url, start_year):
    """Download the current week's table and standardise its columns.

    Three files are fetched from ``base_url``: ``RVD_UpdateDate.csv`` (used as
    the issue date), ``RVD_SummaryText.csv`` (its summary title supplies the
    current week number) and ``RVD_CurrentWeekTable.csv`` (the data).

    Parameters
    ----------
    base_url : str
        URL prefix the three file names are appended to.
    start_year : int
        First calendar year of the season.  Week numbers below 34 are assigned
        to ``start_year + 1``, all other weeks to ``start_year``.

    Returns
    -------
    pandas.DataFrame
        The current-week table with ``epiweek``, ``time_value`` and ``issue``
        as its first three columns, standardised column names, and
        ``geo_value`` / ``geo_type`` columns.

    Raises
    ------
    requests.HTTPError
        If any download returns an error status.
    ValueError
        If no week number can be found in the summary title.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    # Get update date
    update_date_response = requests.get(base_url + "RVD_UpdateDate.csv", headers=headers)
    # Fail here with a clear HTTP error rather than later while parsing an
    # error page as a date.
    update_date_response.raise_for_status()
    update_date = datetime.strptime(
        update_date_response.text.strip(), "%m/%d/%Y %H:%M:%S"
    ).strftime("%Y-%m-%d")

    # Get current week and year
    summary_response = requests.get(base_url + "RVD_SummaryText.csv", headers=headers)
    summary_response.raise_for_status()
    summary_df = pd.read_csv(io.StringIO(summary_response.text))

    week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type'] == "title")]
    week_string = week_df.iloc[0]['Text'].lower()
    week_match = re.search("week (.+?) ", week_string)
    if week_match is None:
        raise ValueError("No week number found in summary title: " + repr(week_string))
    current_week = int(week_match.group(1))

    if current_week < 34:
        current_year = start_year + 1
    else:
        current_year = start_year

    current_epiweek = Week(current_year, current_week)

    # Get weekly data
    weekly_response = requests.get(base_url + "RVD_CurrentWeekTable.csv", headers=headers)
    weekly_response.raise_for_status()
    # The encoding has to be set before .text is read.
    weekly_response.encoding = 'UTF-8'
    df_weekly = pd.read_csv(io.StringIO(weekly_response.text))

    # Move the first "_"-separated token of every column name to the end.
    df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:] + x.split('_')[:1]))
    df_weekly.insert(0, "epiweek", int(str(current_epiweek)))
    df_weekly.insert(1, "time_value", str(current_epiweek.enddate()))
    df_weekly.insert(2, "issue", update_date)

    # Applied to each column name in this order, after abbreviate_virus; later
    # rules depend on the output of earlier ones (e.g. "pos" -> "positive_tests"
    # precedes the "apositive" / "bpositive" fixes).
    column_substitutions = (
        (r'test\b', 'tests'),
        (r'pos\b', 'positive_tests'),
        (r'flua_', 'flu_a'),
        (r'flub_', 'flu_b'),
        (r'bpositive', 'b_positive'),
        (r'apositive', 'a_positive'),
        (r'flu_ah1_', 'flu_ah1pdm09_'),
        (r' ', '_'),
    )

    def standardise_column(name):
        name = abbreviate_virus(name)
        for pattern, replacement in column_substitutions:
            name = re.sub(pattern, replacement, name)
        return name

    df_weekly.columns = [standardise_column(c) for c in df_weekly.columns]
    df_weekly = df_weekly.rename(columns={'reportinglaboratory': "geo_value"})
    df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']]
    df_weekly['geo_type'] = [create_geo_types(g, "lab") for g in df_weekly['geo_value']]

    return df_weekly
599-
600-
601415
#%% Scrape each season
602416

603417
urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html",

0 commit comments

Comments
 (0)