Skip to content

Commit b0988b4

Browse files
authored
Merge pull request #281 from cmu-delphi/r-pkg-devel
Rollup of R package changes
2 parents 2547d3d + 83501c0 commit b0988b4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+576
-494
lines changed

R-packages/covidcast/R/utils.R

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ latest_issue <- function(df) {
1515
attrs <- attrs[!(names(attrs) %in% c("row.names", "names"))]
1616

1717
df <- df %>%
18-
dplyr::group_by(.data$geo_value, .data$time_value) %>%
19-
dplyr::filter(.data$issue == max(.data$issue)) %>%
20-
dplyr::ungroup()
18+
dplyr::arrange(dplyr::desc(.data$issue)) %>%
19+
dplyr::distinct(.data$geo_value, .data$time_value,
20+
.keep_all = TRUE)
2121

2222
attributes(df) <- c(attributes(df), attrs)
2323

@@ -41,9 +41,9 @@ earliest_issue <- function(df) {
4141
attrs <- attrs[!(names(attrs) %in% c("row.names", "names"))]
4242

4343
df <- df %>%
44-
dplyr::group_by(.data$geo_value, .data$time_value) %>%
45-
dplyr::filter(.data$issue == min(.data$issue)) %>%
46-
dplyr::ungroup()
44+
dplyr::arrange(.data$issue) %>%
45+
dplyr::distinct(.data$geo_value, .data$time_value,
46+
.keep_all = TRUE)
4747

4848
attributes(df) <- c(attributes(df), attrs)
4949

R-packages/covidcast/_pkgdown.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,23 @@ home:
99
- text: View the COVIDcast map
1010
href: https://covidcast.cmu.edu/
1111

12+
articles:
13+
- title: Using the package
14+
desc: Basic usage and examples.
15+
navbar: ~
16+
contents:
17+
- covidcast
18+
- plotting-signals
19+
- correlation-utils
20+
- multi-signals
21+
22+
repo:
23+
url:
24+
home: https://github.com/cmu-delphi/covidcast/tree/main/R-packages/covidcast
25+
source: https://github.com/cmu-delphi/covidcast/blob/main/R-packages/covidcast/
26+
issue: https://github.com/cmu-delphi/covidcast/issues
27+
user: https://github.com/
28+
1229
reference:
1330
- title: Fetch data
1431
desc: Fetch signals and metadata from the COVIDcast API
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Internal utility functions.
2+
3+
test_that("latest_issue gives only the latest issue", {
4+
foo <- data.frame(
5+
geo_value = c(rep("pa", 3), rep("tx", 3)),
6+
issue = c(3, 2, 1, 1, 2, 3),
7+
time_value = 1,
8+
value = c(4, 5, 6, 7, 8, 9))
9+
10+
latest <- data.frame(
11+
geo_value = c("pa", "tx"),
12+
issue = 3,
13+
time_value = 1,
14+
value = c(4, 9))
15+
16+
expect_equal(latest_issue(foo), latest)
17+
})
18+
19+
test_that("earliest_issue gives only the earliest issue", {
20+
foo <- data.frame(
21+
geo_value = c(rep("pa", 3), rep("tx", 3)),
22+
issue = c(3, 2, 1, 1, 2, 3),
23+
time_value = 1,
24+
value = c(4, 5, 6, 7, 8, 9))
25+
26+
earliest <- data.frame(
27+
geo_value = c("pa", "tx"),
28+
issue = 1,
29+
time_value = 1,
30+
value = c(6, 7))
31+
32+
expect_equal(earliest_issue(foo), earliest)
33+
})
Lines changed: 54 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
---
2-
title: Correlation utilities
2+
title: 2. Computing signal correlations
3+
description: Calculate correlations over space and time between multiple signals.
34
output: rmarkdown::html_vignette
45
vignette: >
5-
%\VignetteIndexEntry{2. Correlation utilities}
6+
%\VignetteIndexEntry{2. Computing signal correlations}
67
%\VignetteEngine{knitr::rmarkdown}
78
%\VignetteEncoding{UTF-8}
89
---
@@ -11,9 +12,8 @@ The covidcast package provides some simple utilities for exploring the
1112
correlations between two signals, over space or time, which may be helpful for
1213
simple analyses and explorations of data.
1314

14-
For these examples, we'll load confirmed cases and deaths to compare against,
15-
and restrict our analysis to counties with at least 500 total cases by August
16-
15th.
15+
For these examples, we'll load confirmed case and death rates, and restrict our
16+
analysis to counties with at least 500 total cases by August 15th.
1717

1818
```{r, message = FALSE}
1919
library(covidcast)
@@ -22,27 +22,30 @@ library(dplyr)
2222
start_day <- "2020-03-01"
2323
end_day <- "2020-08-15"
2424
25-
inum <- suppressMessages(
25+
iprop <- suppressMessages(
2626
covidcast_signal(data_source = "jhu-csse",
27-
signal = "confirmed_7dav_incidence_num",
27+
signal = "confirmed_7dav_incidence_prop",
2828
start_day = start_day, end_day = end_day)
2929
)
30-
summary(inum)
30+
summary(iprop)
3131
32-
dnum <- suppressMessages(
32+
dprop <- suppressMessages(
3333
covidcast_signal(data_source = "jhu-csse",
34-
signal = "deaths_7dav_incidence_num",
34+
signal = "deaths_7dav_incidence_prop",
3535
start_day = start_day, end_day = end_day)
3636
)
37-
summary(dnum)
37+
summary(dprop)
3838
3939
# Restrict attention to "active" counties with at least 500 total cases
4040
case_num <- 500
41-
geo_values <- inum %>% group_by(geo_value) %>%
42-
summarize(total = sum(value)) %>%
43-
filter(total >= case_num) %>% pull(geo_value)
44-
inum_act <- inum %>% filter(geo_value %in% geo_values)
45-
dnum_act <- dnum %>% filter(geo_value %in% geo_values)
41+
geo_values <- suppressMessages(
42+
covidcast_signal(data_source = "jhu-csse",
43+
signal = "confirmed_cumulative_num",
44+
start_day = end_day, end_day = end_day) %>%
45+
filter(value >= case_num) %>% pull(geo_value)
46+
)
47+
iprop_act <- iprop %>% filter(geo_value %in% geo_values)
48+
dprop_act <- dprop %>% filter(geo_value %in% geo_values)
4649
```
4750

4851
## Correlations sliced by time
@@ -60,91 +63,72 @@ by setting `by = "time_value"`:
6063
library(ggplot2)
6164
6265
# Compute correlation per time, over all counties
63-
df_cor1 <- covidcast_cor(inum_act, dnum_act, by = "time_value")
66+
df_cor <- covidcast_cor(iprop_act, dprop_act, by = "time_value")
6467
6568
# Plot the correlation time series
66-
ggplot(df_cor1, aes(x = time_value, y = value)) + geom_line() +
67-
labs(title = "Correlation between cases and deaths",
69+
ggplot(df_cor, aes(x = time_value, y = value)) + geom_line() +
70+
labs(title = "Correlation between case and death rates",
6871
subtitle = sprintf("Per day, over counties with at least %i cases",
6972
case_num),
7073
x = "Date", y = "Correlation")
7174
```
72-
73-
(The sudden drop on July 25th is due to a [sudden change in how New Jersey
74-
reported deaths](https://github.com/CSSEGISandData/COVID-19/issues/2763) being
75-
reflected in our data source as large outliers; since the signal is a 7-day
76-
average, these outliers last until the beginning of July and affect the reported
77-
correlation.)
78-
79-
We might also be interested in how cases now correlate with deaths in the
80-
*future*. Using the `dt_x` parameter, we can lag cases back 10 days in time,
81-
before calculating correlations:
82-
83-
```{r, warning = FALSE}
84-
# Same, but now lag incidence case numbers back 10 days in time
85-
df_cor2 <- covidcast_cor(inum_act, dnum_act, by = "time_value", dt_x = -10)
86-
87-
# Stack rowwise into one data frame, then plot time series
88-
df_cor <- rbind(df_cor1, df_cor2)
89-
df_cor$dt <- as.factor(c(rep(0, nrow(df_cor1)), rep(-10, nrow(df_cor2))))
90-
ggplot(df_cor, aes(x = time_value, y = value)) +
91-
geom_line(aes(color = dt)) +
92-
labs(title = "Correlation between cases and deaths",
93-
subtitle = sprintf("Per day, over counties with at least %i cases",
94-
case_num),
95-
x = "Date", y = "Correlation") +
96-
theme(legend.position = "bottom")
97-
```
98-
99-
We can see that, for the most part, lagging the cases time series back by 10
100-
days improves correlations, showing that cases are better correlated with deaths
101-
10 days from now.
102-
103-
We can also look at Spearman (rank) correlation, which is a more robust measure
104-
of correlation: it's invariant to monotone transformations, and doesn't rely on
105-
any particular functional form for the dependence between two variables.
75+
76+
The above plot addresses the question: "on any given day, are case and death
77+
rates linearly associated, over US counties?". We might be interested in
78+
broadening this question, instead asking: "on any given day, do higher case
79+
rates tend to associate with higher death rates?", removing the dependence on a
80+
linear relationship. The latter can be addressed using Spearman correlation,
81+
accomplished by setting `method = "spearman"` in the call to `covidcast_cor()`.
82+
Spearman correlation is highly robust and invariant to monotone transformations
83+
(it doesn't rely on any particular functional form for the dependence between
84+
two variables).
85+
86+
We might also be interested in how case rates associate with death
87+
rates in the *future*. Using the `dt_x` parameter in `covidcast_cor()`, we can
88+
lag case rates back any number of days we want, before calculating correlations.
10689

10790
```{r, warning = FALSE}
108-
# Repeat this comparison, but now using Spearman (rank) correlation
109-
df_cor1 <- covidcast_cor(inum_act, dnum_act, by = "time_value",
91+
# Use Spearman correlation, with case rates and 10-day lagged case rates
92+
df_cor1 <- covidcast_cor(iprop_act, dprop_act, by = "time_value",
11093
method = "spearman")
111-
df_cor2 <- covidcast_cor(inum_act, dnum_act, by = "time_value", dt_x = -10,
94+
df_cor2 <- covidcast_cor(iprop_act, dprop_act, by = "time_value", dt_x = -10,
11295
method = "spearman")
11396
11497
# Stack rowwise into one data frame, then plot time series
11598
df_cor <- rbind(df_cor1, df_cor2)
11699
df_cor$dt <- as.factor(c(rep(0, nrow(df_cor1)), rep(-10, nrow(df_cor2))))
117100
ggplot(df_cor, aes(x = time_value, y = value)) +
118101
geom_line(aes(color = dt)) +
119-
labs(title = "Correlation between cases and deaths",
102+
labs(title = "Correlation between case and death rates",
120103
subtitle = sprintf("Per day, over counties with at least %i cases",
121104
case_num),
122105
x = "Date", y = "Correlation") +
123106
theme(legend.position = "bottom")
124107
```
125108

126-
The "big dip" is gone (since the Spearman correlation uses ranks and not the
127-
actual values, and hence is less sensitive to outliers), and we can again see
128-
that lagging the cases time series helps correlations.
109+
We can see that, for the most part, the Spearman measure has bolstered the
110+
correlations; and generally, lagging the case rates time series back by 10 days
111+
improves correlations, confirming case rates are better correlated with death
112+
rates 10 days from now.
129113

130114
## Correlations sliced by county
131115

132116
The second option we have is to "slice by location": this calculates, for each
133117
geographic location, correlation between the time series of two signals. This
134118
is obtained by setting `by = "geo_value"`. We'll again look at correlations
135-
both for observations at the same time and for 10-day lagged cases:
119+
both for observations at the same time and for 10-day lagged case rates:
136120

137121
```{r, warning = FALSE}
138122
# Compute correlation per county, over all times
139-
df_cor1 <- covidcast_cor(inum_act, dnum_act, by = "geo_value")
140-
df_cor2 <- covidcast_cor(inum_act, dnum_act, by = "geo_value", dt_x = -10)
123+
df_cor1 <- covidcast_cor(iprop_act, dprop_act, by = "geo_value")
124+
df_cor2 <- covidcast_cor(iprop_act, dprop_act, by = "geo_value", dt_x = -10)
141125
142126
# Stack rowwise into one data frame, then plot densities
143127
df_cor <- rbind(df_cor1, df_cor2)
144128
df_cor$dt <- as.factor(c(rep(0, nrow(df_cor1)), rep(-10, nrow(df_cor2))))
145129
ggplot(df_cor, aes(value)) +
146130
geom_density(aes(color = dt, fill = dt), alpha = 0.5) +
147-
labs(title = "Correlation between cases and deaths",
131+
labs(title = "Correlation between case and death rates",
148132
subtitle = "Computed separately for each county, over all times",
149133
x = "Date", y = "Density") +
150134
theme(legend.position = "bottom")
@@ -162,8 +146,8 @@ attributes(df_cor2)$metadata$geo_type <- "county"
162146
class(df_cor2) <- c("covidcast_signal", "data.frame")
163147
164148
# Plot choropleth maps, using the covidcast plotting functionality
165-
plot(df_cor2, title = "Correlations between 10-day lagged cases and deaths",
166-
range = c(-1, 1), choro_col = c("orange","lightblue", "purple"))
149+
plot(df_cor2, title = "Correlations between 10-day lagged case and death rates",
150+
range = c(-1, 1), choro_col = c("orange", "lightblue", "purple"))
167151
```
168152

169153
## More systematic lag analysis
@@ -177,7 +161,7 @@ this:
177161
dt_vec <- -(0:15)
178162
df_list <- vector("list", length(dt_vec))
179163
for (i in 1:length(dt_vec)) {
180-
df_list[[i]] <- covidcast_cor(inum_act, dnum_act, dt_x = dt_vec[i],
164+
df_list[[i]] <- covidcast_cor(iprop_act, dprop_act, dt_x = dt_vec[i],
181165
by = "geo_value")
182166
df_list[[i]]$dt <- dt_vec[i]
183167
}
@@ -188,11 +172,11 @@ df %>%
188172
group_by(dt) %>%
189173
summarize(median = median(value, na.rm = TRUE), .groups = "drop_last") %>%
190174
ggplot(aes(x = dt, y = median)) + geom_line() + geom_point() +
191-
labs(title = "Median correlation between cases and deaths",
175+
labs(title = "Median correlation between case and death rates",
192176
x = "dt", y = "Correlation") +
193177
theme(legend.position = "bottom", legend.title = element_blank())
194178
```
195179

196-
We can see that the median correlation between cases and deaths (where the
180+
We can see that the median correlation between case and death rates (where the
197181
correlations come from slicing by location) is maximized when we lag the case
198-
incidence numbers back 8 days in time.
182+
incidence rates back 8 days in time.

R-packages/covidcast/vignettes/covidcast.Rmd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
22
title: Get started with covidcast
3+
description: An introductory tutorial with examples.
34
output: rmarkdown::html_vignette
45
vignette: >
56
%\VignetteIndexEntry{Get started with covidcast}

R-packages/covidcast/vignettes/multi-signals.Rmd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
2-
title: Manipulating multiple signals
2+
title: 3. Manipulating multiple signals
3+
description: Download multiple signals at once, and aggregate and manipulate them in various ways.
34
output: rmarkdown::html_vignette
45
vignette: >
56
%\VignetteIndexEntry{3. Manipulating multiple signals}

R-packages/covidcast/vignettes/plotting-signals.Rmd

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
2-
title: Plotting and mapping signals
2+
title: 1. Plotting and mapping signals
3+
description: Make custom time series plots, choropleth maps, and bubble plots of signals.
34
output: rmarkdown::html_vignette
45
vignette: >
56
%\VignetteIndexEntry{1. Plotting and mapping signals}
@@ -248,4 +249,4 @@ ggplot(df, aes(x = time_value, y = value)) +
248249
```
249250

250251
Again, we see that the combined indicator starts rising several days before the
251-
new COVID-19 cases do, an exciting phenomenon that Delphi is studying now.
252+
new COVID-19 cases do, an exciting phenomenon that Delphi is studying now.

docs/covidcastR/404.html

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/covidcastR/LICENSE-text.html

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)