-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis_inpat.R
362 lines (280 loc) · 10.6 KB
/
analysis_inpat.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
library(dplyr)
library(ggplot2)
library(here)
FILE = "COVID_1_SLH.tab"
# onedrive --> files / covid-19 / covid_datathon / data / slh_data
stluke_lab = read.csv(here('data', '2020-07-31', FILE), sep="\t", stringsAsFactors = FALSE)
cat('Raw input-----\n')
stluke_lab %>% select(PROC_NAME, CURRENT_ICD10_LIST, ORD_VALUE, NAME) %>% head(n=20)
## NOTED_DATE is first date a CC was noted by the provider
## NOTED DATE is specific to Problem List!!
## covid_1_slh is somewhat joined up.
## about 50 pts per TAB file
stluke_lab %>%
filter(PAT_CLASS == "Outpatient") ->
out_only
# What is "outpatient":
# still is really outpatient, outpatient test xray us etc. monitoring etc.
# endoscopy, outpat cath, outpat ed.
cat('distrib of patient class----\n')
table(stluke_lab$ADT_PAT_CLASS_C)
table(stluke_lab$PAT_CLASS)
# 101 102 103 104 106 129
#422644 12248 54964 23553 5910 5436
# Very good that Rory mapped adt_pat_class_c to pat_class and also kept both.
# Emergency
# Hospital Outpatient Surgery
# Inpatient
# Inpatient Rehab
# Observation
# Outpatient
######## NOTES
# never been done - TODO -->
# 6. inpatient: testing late in admission: rate of this, rate of positives.
# cool to have ER data.
# My process:
# (1) Understand Rory's process; Plus Explore the .tab files & understand.
# (2) Analyses according to thoughts.pdf. Do what's feasible.
## Data Pull Process is as follows:
## took lab for covid
## find all pts who had those lab
## randomize
## took first 150 pat ids
## ran those pat ids over whole hosp record
# Next pull:
# hosp admission disch dx
# pulse ox
# maaaaybe the order date vs result date
# totally okay to NOT join admissions with the problem list.
# Where will datathon people get their data dictionaries? (Gloria has it: need just a BCM ECA to log in/)
# Are people going to want to download and do locally: deident???
stluke_lab %>%
select(PAT_ID, ORDERING_DATE, PROC_NAME, NAME, ORD_VALUE, ORD_NUM_VALUE, PAT_CLASS) %>%
filter(ORD_VALUE != "NULL" | ORD_NUM_VALUE != "NULL") %>%
distinct() %>%
arrange(PAT_ID, ORDERING_DATE) -> # TODO - maybe should get rid of this arrange(). Maybe select ORDER_PROC_ID too.
lab_no_join
# Thoughts about future stuff to pull:
# people who touch chart
# specimen collect TIME not just date
# mortality !! pat_enc_hosp, also "care name"
# emp id versus "ser" which is about role / job description
#### 2020-08-07 ####
#### Analyze problem list diagnoses ####
stluke_lab %>%
select(PAT_ID, DX_NAME, CURRENT_ICD10_LIST) %>%
distinct() %>%
arrange(PAT_ID, CURRENT_ICD10_LIST) ->
dx
# NOTED_DATE is sometimes == "NULL"
# but nonetheless, later we should exclude ones that are to recent
# ^^^^^ TODO
cat('patients by count of diagnoses-----\n')
dx %>% count(PAT_ID) %>% arrange(desc(n)) # output - who's "sickest"
dx %>%
group_by(PAT_ID) %>% ## REUSE
summarise(dm = sum(grepl("diab", DX_NAME, ignore.case = TRUE)),
asthma = sum(grepl("asth", DX_NAME, ignore.case = TRUE)),
copd = sum(grepl("copd", DX_NAME, ignore.case = TRUE)),) ->
comorb_count
# TODO - hypertension
#### filter & standardize covid results
#table(cov_lab$ORD_VALUE)
# Detected Negative Not Detected Positive
# 6 27 24 3
lab_no_join %>%
filter(grepl("cov", NAME, ignore.case = TRUE) & ! grepl("performing", NAME, ignore.case = TRUE)) %>%
select(-ORD_NUM_VALUE, -PROC_NAME) %>%
mutate(cov_result = case_when(
ORD_VALUE == "Negative" | ORD_VALUE == "Not Detected" ~ 0,
ORD_VALUE == "Positive" | ORD_VALUE == "Detected" ~ 1
)) ->
cov_lab_tall
# 60 rows for about 50 pts, if we use "tall"
# Inspect these because we're about to lose them in a summarise()
cat('Tests maching string cov, vals of NAME and PAT_CLASS----\n')
table(cov_lab_tall$NAME) # yes all 60 are the same
table(cov_lab_tall$PAT_CLASS) # ER 9, inp 33, obs 3, outp 15
cov_lab_tall %>%
group_by(PAT_ID) %>%
summarise(n_pos = sum(cov_result), n_done = n()) %>%
mutate (proportion_pos = n_pos / n_done) %>%
arrange(desc(proportion_pos)) ->
cov_lab
# only one has P = 0.5
#### finally join diagnoses with covid results!
#> dim(cov_lab)
#[1] 45 4
#> dim(comorb_count)
#[1] 49 2
# TODO why is this: some pts w/o covid test??
comorb_count %>%
full_join(cov_lab) %>%
mutate(covid_boolean = (proportion_pos > 0),
dm_boolean = (dm>0),
asthma_boolean = (asthma>0),
copd_boolean = (copd>0)
)->
comorb_covid ## A nice and neat table for later
#### explore asthma copd htn
# the code below looks nicer than table()
dx %>%
select(DX_NAME) %>%
group_by(DX_NAME) %>%
summarise(n=n()) %>%
arrange(desc(n)) ->
counts_dx
cat('\n\nexplore string matching of dx_name----\n')
counts_dx %>% filter(grepl("copd", DX_NAME, ignore.case = TRUE))
counts_dx %>% filter(grepl("obstr", DX_NAME, ignore.case = TRUE)) # not needed
counts_dx %>% filter(grepl("asth", DX_NAME, ignore.case = TRUE))
counts_dx %>% filter(grepl("hypert", DX_NAME, ignore.case = TRUE)) # filter out pulm htn
counts_dx %>% filter(grepl("htn", DX_NAME, ignore.case = TRUE)) # not needed
#### Analyses
# 2 x 2 tables
dm_cov_table = with(comorb_covid, table(covid_boolean, dm_boolean))
# 4:5 ratio diabetics had + covid
# 4:32 ratio non-dm had + covid
Fd = fisher.test(dm_cov_table)
# p-value = 0.03886
# OR = 6.040232 (0.8439568 45.8211413)
asth_cov_table = with(comorb_covid, table(covid_boolean, asthma_boolean))
Fa = fisher.test(asth_cov_table)
# -ast= 36:7, +ast= 1:1
copd_cov_table = with(comorb_covid, table(covid_boolean, copd_boolean))
Fc = fisher.test(copd_cov_table)
# -copd= 34:8, +copd= 3:0
####print stuff
cat('\n\noutputs----\n')
dm_cov_table
Fd
asth_cov_table
Fa
copd_cov_table
Fc
#### plots
# fishers = rbind(Fd, Fa, Fc) # matrix
fishers = list(Fd, Fa, Fc) # for some reason, need [[1]] ??
df = data.frame()
for (i in seq(3)){
temp = data.frame(est = fishers[i][[1]]$estimate,
lower = fishers[i][[1]]$conf.int[1],
upper = fishers[i][[1]]$conf.int[2],
comorb_longname = fishers[i][[1]]$data.name
)
df = rbind(df, temp)
}
df %>%
mutate(comorb_name = case_when(comorb_longname=='dm_cov_table'~'Diabetes',
comorb_longname=='asth_cov_table'~'Asthma',
comorb_longname=='copd_cov_table'~'COPD')) ->
df
odds_plot = ggplot(df, aes(comorb_name, est)) +
geom_pointrange(aes(ymin = lower, ymax = upper)) +
scale_y_log10() +
labs(x = 'Comorbidity', y='Odds ratio', title='Odds of COVID +:- in disease vs. not', subtitle='Inpatient/BSLMC') +
geom_hline(yintercept = 1)
######## Oxygenation, univariate/descriptive ########
# still TODO -->
# * inpatient: testing late in admission: rate of this, rate of positives.
# * rate of admissions/ER , by risk factors
# couple ?s for Rory
# Highest priorities for future data pulls
# in hospital mortality
# pulse ox <-- Would be good, but wait til further notice. (NB we have CPO order status)
# in absence of SpO2: get ABG & VBG.
# For future ref: FIO2 needed too for proper interp.
# hosp admission disch dx
# specimen collect time
# EMP_ID and/or SER
# proc_name = "abg" or "blood gas" or...... "venous blood gas" or.......
# NAME all of : {pH, pCO2, pO2, SaO2, FIO2 !!!!!!, L/min !!!!!, HCO3_art, CO2, base_excess}
# ^^^^ ^^^^ !!!!
# proc_name = blood gas, arterial
cat('\n\nWhat are ABG components----\n')
lab_no_join %>% filter(PROC_NAME == "BLOOD GAS, ARTERIAL") %>% select(NAME) %>% group_by(NAME) %>% summarise(n())
lab_no_join %>%
filter(PROC_NAME == "BLOOD GAS, ARTERIAL", NAME == "FIO2 (BEAKER)") %>%
mutate(FIO2 = as.numeric(ORD_NUM_VALUE)) %>%
select(PAT_ID, ORDERING_DATE, FIO2) ->
fio2s
cat('\n\nSome pts have mult ABGs per date----\n')
fio2s %>%
group_by(PAT_ID, ORDERING_DATE) %>%
summarise(n=n()) %>%
arrange(desc(n))
######## Oxygenation, both pO2 and FIO2 ########
## TODO - this looks like a job for... tidyr! But need to upgrade R.
lab_no_join %>%
filter(PROC_NAME == "BLOOD GAS, ARTERIAL") %>%
head()
# found out order_proc_id is the magic key that you want to group them
cat('order_proc_id is good----\n')
stluke_lab %>% filter(PROC_NAME == "BLOOD GAS, ARTERIAL" & (NAME == "FIO2 (BEAKER)" | NAME =='PO2 ARTERIAL (BEAKER)')) %>% select(ORDER_PROC_ID, PAT_ID, ORDERING_DATE, PROC_ID, NAME) %>% unique() %>% head(20)
stluke_lab %>%
filter(NAME == "FIO2 (BEAKER)" | NAME =='PO2 ARTERIAL (BEAKER)') %>%
select(ORDER_PROC_ID, PAT_ID, ORDERING_DATE, NAME, ORD_NUM_VALUE) %>%
distinct() ->
oxygenation
# here is where I usually would do tidyr
oxygenation %>%
filter(NAME == "FIO2 (BEAKER)" ) %>%
mutate(fio2 = as.numeric(ORD_NUM_VALUE)) %>%
select(-ORD_NUM_VALUE, -NAME) ->
fio2_tbj
oxygenation %>%
filter(NAME =='PO2 ARTERIAL (BEAKER)') %>%
mutate(po2 = as.numeric(ORD_NUM_VALUE)) %>%
select(ORDER_PROC_ID, po2) %>%
full_join(fio2_tbj, by = "ORDER_PROC_ID") %>%
mutate(pfr = 100 * po2 / fio2,
category = case_when(
pfr < 200 ~ 'ARDS',
(pfr < 300) & (pfr >= 200) ~ 'ALI',
pfr >= 300 ~ 'Normal'
)
) ->
oxy_joined
# todo exclude old abgs. i got 3 from 2014. rest = 2020.
# TODO - How many of the patients in the dataset *lack* ABGs?
# i.e. Get idea of how bad confound by test indication can be.
######## Oxygenation vs COVID test ########
cat('tests----\n')
comorb_covid %>%
select(PAT_ID, covid_boolean, dm_boolean, asthma_boolean, copd_boolean) %>%
full_join(oxy_joined) %>%
filter(!is.na(covid_boolean)) -> # todo count how many na
covid_comorb_oxy
ggplot(covid_comorb_oxy, aes(fio2, po2, color=category, shape=covid_boolean)) +
geom_point() +
labs(title='Oxygenation severity vs. COVID status', shape='COVID', color='Severity', subtitle='Inpatient/BSLMC')->
ards_scatter_shape
# po2
ggplot(covid_comorb_oxy, aes(covid_boolean, po2)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width=0.2) +
labs(title='Arterial oxygen vs. COVID status', x='COVID test result', subtitle='Inpatient/BSLMC')->
po2_covid_box
wilcox.test(covid_comorb_oxy[covid_comorb_oxy$covid_boolean == FALSE, ]$po2,
covid_comorb_oxy[covid_comorb_oxy$covid_boolean == TRUE, ]$po2)
# p-value = 1.187e-05 , no surprise
# pfr
ggplot(covid_comorb_oxy, aes(covid_boolean, pfr)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width=0.2) +
labs(title='PO2:FIO2 ratio vs. COVID status', x='COVID test result', y='P:F ratio', subtitle='Inpatient/BSLMC') ->
pfr_covid_box
wilcox.test(covid_comorb_oxy[covid_comorb_oxy$covid_boolean == FALSE, ]$pfr,
covid_comorb_oxy[covid_comorb_oxy$covid_boolean == TRUE, ]$pfr)
# p-value = 3.161e-05 , also no surprise
pdf("Rplots_inpat.pdf")
odds_plot
ards_scatter_shape
po2_covid_box
pfr_covid_box
dev.off()
# here('pngs', '
ggsave(here('pngs', 'fig4-odds-vs-comorb.png'), plot=odds_plot)
ggsave(here('pngs', 'fig5-ards-vs-result.png'), plot=ards_scatter_shape)
ggsave(here('pngs', 'fig6-po2-vs-result.png'), plot=po2_covid_box)
ggsave(here('pngs', 'fig7-pfr-vs-result.png'), plot=pfr_covid_box)