UI-Research · tinatinc · Oct 17, 2024 · Oct 29, 2024 · Dec 30, 2024 · Dec 30, 2024
diff --git a/06_neighborhoods/Transportation/final/transit_cost_all_city.csv b/06_neighborhoods/Transportation/final/transit_cost_all_city.csv
diff --git a/06_neighborhoods/Transportation/final/transit_cost_all_county.csv b/06_neighborhoods/Transportation/final/transit_cost_all_county.csv
diff --git a/06_neighborhoods/Transportation/final/transit_cost_all_subgroups_city.csv b/06_neighborhoods/Transportation/final/transit_cost_all_subgroups_city.csv
diff --git a/06_neighborhoods/Transportation/final/transit_cost_all_subgroups_county.csv b/06_neighborhoods/Transportation/final/transit_cost_all_subgroups_county.csv
diff --git a/06_neighborhoods/Transportation/final/transit_trips_all_city.csv b/06_neighborhoods/Transportation/final/transit_trips_all_city.csv
diff --git a/06_neighborhoods/Transportation/final/transit_trips_all_county.csv b/06_neighborhoods/Transportation/final/transit_trips_all_county.csv
diff --git a/06_neighborhoods/Transportation/final/transit_trips_all_subgroups_city.csv b/06_neighborhoods/Transportation/final/transit_trips_all_subgroups_city.csv
diff --git a/06_neighborhoods/Transportation/final/transit_trips_all_subgroups_county.csv b/06_neighborhoods/Transportation/final/transit_trips_all_subgroups_county.csv
diff --git a/06_neighborhoods/Transportation/transit_cost_county.qmd b/06_neighborhoods/Transportation/transit_cost_county.qmd
@@ -14,7 +14,7 @@ editor_options:
   chunk_output_type: console
 ---
 
-*2023-2024 Mobility Metrics update* 
+*2024-2025 Mobility Metrics update* 
 
 SUMMARY-LEVEL VALUES 
 
@@ -36,16 +36,26 @@ library(tidyverse)
 library(purrr)
 ```
 
+Read in the final evaluation function for later use.
+```{r}
+# Source the final-evaluation script (expected to define evaluate_final_data(),
+# which is called at the end of this program).
+source(
+  here::here("functions", "testing", "evaluate_final_data.R")
+)
+```
+
+
 ## Read data
 
 The data from HUD cannot be easily read directly into this program.
 Before running, please download the files below from the following [Box
 folder](https://urbanorg.app.box.com/folder/250262697073) into the
-repository folder
+repository folder shown below. To do this, you will have to create a "data" folder
+within the Transportation folder. This data folder will not be published outside
+your branch because it is gitignored (the repository is meant to hold only code,
+not raw data files).
 `"mobility-from-poverty\06_neighborhoods\Transportation\data"`
 
 -   htaindex2015_data_counties.csv
 -   htaindex2019_data_counties.csv
+-   htaindex2020_data_counties.csv
 
 Import all the files (and/or combine into one file) with only the
 relevant variables and years
@@ -110,6 +120,36 @@ transportation_cost_county_2019 <- transport_county_2019 %>%
   select(state, county, blkgrps, population, households, t_80ami)
 ```
 
+### 2020
+
+```{r}
+# Read the 2020 county-level HTA index extract from the local data folder and
+# narrow it straight away to the columns used later in this program.
+transport_county_2020 <- here::here(
+  "06_neighborhoods", "Transportation", "data",
+  "htaindex2020_data_counties.csv"
+) %>%
+  read_csv() %>%
+  select(county, blkgrps, population, households, t_80ami)
+```
+
+Create correct FIPS columns
+
+```{r}
+# Split the raw `county` identifier into separate FIPS pieces: characters 2-3
+# become `state` and characters 4-6 become `county`.
+# `state` must be derived first because `county` is overwritten in the same call.
+# NOTE(review): presumably character 1 is a leading quote in the raw file - confirm.
+transport_county_2020 <- mutate(
+  transport_county_2020,
+  state = substr(county, 2, 3),
+  county = substr(county, 4, 6)
+)
+```
+
+Keep only variables of interest
+
+```{r}
+# Build the 2020 analysis table: geography identifiers first, then the counts
+# and the t_80ami measure.
+transportation_cost_county_2020 <- select(
+  transport_county_2020,
+  state, county, blkgrps, population, households, t_80ami
+)
+```
+
 
 Compare to our official county file to make sure we have all counties accounted for
 
@@ -122,12 +162,26 @@ counties <- read_csv(here::here("geographic-crosswalks",
 
 counties_2015 <- counties %>%
   filter(year == 2015)
+cat("Number of rows in counties_2015:", nrow(counties_2015), "\n")
 
 counties_2019 <- counties %>%
   filter(year == 2019)
+cat("Number of rows in counties_2019:", nrow(counties_2019), "\n")
+
+counties_2020 <- counties %>%
+  filter(year == 2020)
+cat("Number of rows in counties_2020:", nrow(counties_2020), "\n")
 ```
 
-All files have same number of observations (3142) so no merging needed to account for missings!
+The 2015 and 2019 files have the same number of observations (3142, because we still include the 8 CT counties). The 2020 file has 3,143 due to the Alaska county split. Checking that's the case below:
+
+```{r}
+# Rows of the 2020 crosswalk with no 2015 row sharing the same county name and
+# state, i.e. counties that appear only in 2020.
+unique_to_2020 <- anti_join(
+  counties_2020,
+  counties_2015,
+  by = c("county_name", "state")
+)
+```
+
+No data are missing, however: these counts match what we expect for each year, so no merging is needed to account for missing counties.
+
 
 ## QC Checks
 
@@ -181,6 +235,7 @@ if (length(missing_indices) > 0) {
 
 1 missing value: Loving County, TX (48301 FIPS).
 
+
 County-Level Transportation Cost 2019
 
 ```{r}
@@ -225,7 +280,54 @@ if (length(missing_indices) > 0) {
 }
 ```
 
-No missing values for 2019.
+No missing values for 2019.
+
+County-Level Transportation Cost 2020
+
+```{r}
+# Histogram of the raw 2020 t_80ami values across counties, in bins of width 10.
+ggplot(transportation_cost_county_2020, aes(x = t_80ami)) +
+  geom_histogram(binwidth = 10) +
+  labs(
+    x = "Annual Transit Cost for the Regional Moderate Income Household, 2020",
+    y = "number of counties"
+  )
+```
+
+Look at summary stats
+```{r}
+# Summary statistics (including the NA count, if any) for 2020 t_80ami.
+summary(transportation_cost_county_2020[["t_80ami"]])
+```
+
+Examine outliers
+```{r}
+# Pull out the counties whose 2020 t_80ami value exceeds 100 for manual review.
+transportation_cost_county_2020_outliers <- filter(
+  transportation_cost_county_2020,
+  t_80ami > 100
+)
+```
+
+No weird outliers
+
+Use stopifnot to check if all values in "transportation_cost_county_2020" are non-negative
+
+```{r}
+# Halt if the smallest non-missing 2020 t_80ami value is negative.
+# NAs are excluded here (na.rm = TRUE); they are inspected separately below.
+stopifnot(min(transportation_cost_county_2020$t_80ami, na.rm = TRUE) >= 0)
+```
+
+Good to go.
+
+Find indices of missing values for the "transit_cost_80ami" variable
+
+```{r}
+# Row positions in the 2020 table where t_80ami is NA.
+missing_indices <- which(
+  is.na(transportation_cost_county_2020[["t_80ami"]])
+)
+```
+
+Print observations with missing values
+
+```{r}
+# Report the 2020 rows flagged in `missing_indices`, or confirm there are none.
+if (length(missing_indices) == 0) {
+  cat("No missing values for transportation_cost_county_2020\n")
+} else {
+  cat("Observations with missing values for transit_cost_80ami:\n")
+  # drop = FALSE keeps the result a data frame even when one row is selected
+  print(transportation_cost_county_2020[missing_indices, , drop = FALSE])
+}
+```
+
+One missing value - state 48 (TX), county 243 (Jeff Davis County) - for 2020 data.
+
 
 ## Data quality marker
 
@@ -234,6 +336,7 @@ Determine data quality cutoffs based on number of observations (all at the HH le
 ```{r}
 summary(transportation_cost_county_2015$households) 
 summary(transportation_cost_county_2019$households) 
+summary(transportation_cost_county_2020$households) 
 ```
 
 We use a 30 HH cutoff for Data Quality 3 for the ACS variables, so for the sake of consistency, since none of these are less than 30 (all minimum values are at least 30 HHs), Data Quality can be 1 for all these observations BUT ALSO, rename all the metrics variables to what we had before (transit_trips & transit_cost), so we can name the quality variable appropriately
@@ -245,6 +348,9 @@ transportation_cost_county_2015 <- transportation_cost_county_2015 %>%
 transportation_cost_county_2019 <- transportation_cost_county_2019 %>% 
   rename(transit_cost = t_80ami) %>%
   mutate(transit_cost_quality = 1)
+transportation_cost_county_2020 <- transportation_cost_county_2020 %>% 
+  rename(transit_cost = t_80ami) %>%
+  mutate(transit_cost_quality = 1)
 ```
 
 ## Export files
@@ -257,6 +363,7 @@ transportation_cost_county_2015 <- transportation_cost_county_2015 %>%
     year = 2015,
     transit_cost = transit_cost/100
   )
+cat("Number of rows in transportation_cost_county_2015:", nrow(transportation_cost_county_2015), "\n")
 ```
 
 ```{r}
@@ -265,16 +372,63 @@ transportation_cost_county_2019 <- transportation_cost_county_2019 %>%
     year = 2019,
     transit_cost = transit_cost/100
   )
+cat("Number of rows in transportation_cost_county_2019:", nrow(transportation_cost_county_2019), "\n")
+```
+
+```{r}
+# Tag the 2020 rows with their year and divide transit_cost by 100, as is done
+# for the 2015 and 2019 tables (presumably converting a percent to a share).
+transportation_cost_county_2020 <- mutate(
+  transportation_cost_county_2020,
+  year = 2020,
+  transit_cost = transit_cost / 100
+)
+cat(
+  "Number of rows in transportation_cost_county_2020:",
+  nrow(transportation_cost_county_2020),
+  "\n"
+)
 ```
 
-Combine the two years into one overall files for both variables
+Combine the three years into one overall file for both variables
 
 ```{r}
-transit_cost_county <- rbind(transportation_cost_county_2015, transportation_cost_county_2019)
+# Stack the three yearly tables (2015, 2019, 2020) into one long table and
+# report its row count.
+transit_cost_county <- do.call(
+  rbind,
+  list(
+    transportation_cost_county_2015,
+    transportation_cost_county_2019,
+    transportation_cost_county_2020
+  )
+)
+
+cat("Number of rows in transit_cost_county:", nrow(transit_cost_county), "\n")
 ```
 
+Combined file has 9427 observations, which is correct (3142+3142+3143).
 Keep variables of interest and order them appropriately also rename to correct var names
 
+Let's check how the distributions look over time across counties:
+
+```{r}
+# Compare the county-level distribution of transit cost across the three years.
+# At this point the measure is still called `transit_cost`; it is only renamed
+# to `index_transportation_cost` in the chunk that follows, so the plots must
+# map `transit_cost` or they fail with an "object not found" error.
+
+# Box plot: one box per year, outliers drawn as red open circles.
+ggplot(transit_cost_county, aes(x = factor(year), y = transit_cost, fill = factor(year))) +
+  geom_boxplot(outlier.colour = "red", outlier.shape = 1, alpha = 0.7) +
+  labs(
+    title = "Distribution of Transit Costs by County (2015, 2019, 2020)",
+    x = "Year",
+    y = "Transit Cost",
+    fill = "Year"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "top")
+
+# Density plot: overlaid, semi-transparent curves, one per year.
+ggplot(transit_cost_county, aes(x = transit_cost, fill = factor(year), color = factor(year))) +
+  geom_density(alpha = 0.4) +
+  labs(
+    title = "Density Distribution of Transit Costs by County (2015, 2019, 2020)",
+    x = "Transit Cost",
+    y = "Density",
+    fill = "Year",
+    color = "Year"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "top")
+```
+
+The distributions are comparable, which is promising. Costs increased from 2015 to 2019, and then decreased a lot in 2020 to below 2015 levels, which is all rational given what we know about how COVID affected human movement.
+
+
 ```{r}
 transit_cost_county <- transit_cost_county %>%
   rename(index_transportation_cost = transit_cost,
@@ -294,5 +448,12 @@ write_csv(transit_cost_county, here::here("06_neighborhoods",
 ```
 
 
+Run final data test function:
+
+```{r}
+# Run the shared final-data checks against the county transit-cost table that
+# this program builds and writes out. The data argument must be
+# `transit_cost_county`; `transit_trips_county` is never created in this file.
+evaluate_final_data(exp_form_path = "10a_final-evaluation/evaluation_form_transit_cost_all_county_TC.csv",
+                data = transit_cost_county,  geography = "county",
+                subgroups = FALSE, confidence_intervals = FALSE)
+```