# census_clean_for_sym.R

# load libraries
library(tidyverse)

# Load the tract-level table produced by the NHGIS workflow.
tract <- readRDS(file = "tract_final.RDS")

# Columns kept in the output tables, in output order. They are always selected
# with any_of(), so a name that is missing from a table is silently skipped.
my_vars <- c(
  # identifiers
  "geoid_2010", "geoid", "year",
  # counts
  "total_population", "housing_units", "occupied", "vacant", "renter_occ",
  # shares and rates
  "white_perc", "black_perc", "asian_perc", "hawaiian_perc",
  "american_alaskan_perc", "two_or_more_perc", "other_perc", "rural_perc",
  "bachelors_perc", "hispanic_perc", "poverty_perc", "unemployment",
  "turnover_perc",
  # adjusted ("_adj") median measures
  "med_family_income_adj", "med_gross_rent_adj", "med_household_income_adj",
  "median_value_adj"
)

# Keep the output columns, then retain only tracts whose geoid occurs as often
# as the most frequent geoid in the table (i.e. tracts present in every year
# once the upstream adjustments have been made).
tract_1 <- tract %>%
  select(any_of(my_vars)) %>%
  add_count(geoid, name = "my_n") %>%  # my_n = number of rows for this geoid
  filter(my_n == max(my_n)) %>%
  select(-my_n)

# Read the county table from the NHGIS output and keep only records whose
# two-character geoid prefix is below 60. The prefix is converted to an integer
# so the comparison is numeric; comparing the character prefix with the number
# 60 would make R coerce 60 to "60" and compare strings by collation order.
# my_n records how many rows each geoid has and is kept on the table.
county <- readRDS("county_census.RDS") %>%
  filter(as.integer(str_sub(geoid, 1, 2)) < 60L) %>%
  add_count(geoid, name = "my_n")

# Counties whose code does NOT occur as often as the most frequent code,
# i.e. codes that are not consistent over time.
outliers <- county %>%
  filter(my_n != max(my_n)) %>%
  select(-my_n)

# Counties whose code occurs as often as the most frequent code,
# i.e. codes that are consistent over time.
non_outliers <- county %>%
  filter(my_n == max(my_n)) %>%
  select(-my_n)


# 1990 -> 2010 county crosswalk. Downloaded from
# https://www.openicpsr.org/openicpsr/project/150101/version/V4/view?path=/openicpsr/150101/fcr:versions/V4/crosswalks/CountyToCounty/2010/2010_csv.zip&type=file
# Rows whose weight is zero (or missing) are dropped; a stricter cut-off
# (m1_weight > .01) is intentionally not applied. Five-character county codes
# are assembled from characters 2-3 and 5-7 of each gisjoin string.
xwalk_1990 <- read_csv("county/Crosswalk_1990_2010.csv") %>%
  filter(m1_weight != 0) %>%
  transmute(
    geoid_1990 = str_c(str_sub(gisjoin_1990, 2, 3),
                       str_sub(gisjoin_1990, 5, 7)),
    geoid_2010 = str_c(str_sub(gisjoin_2010, 2, 3),
                       str_sub(gisjoin_2010, 5, 7)),
    m1_weight
  )

# 2000 -> 2010 county crosswalk. Downloaded from
# https://www.openicpsr.org/openicpsr/project/150101/version/V4/view?path=/openicpsr/150101/fcr:versions/V4/crosswalks/CountyToCounty/2010/2010_csv.zip&type=file
# Rows whose weight is zero (or missing) are dropped; a stricter cut-off
# (m1_weight > .01) is intentionally not applied. Five-character county codes
# are assembled from characters 2-3 and 5-7 of each gisjoin string.
xwalk_2000 <- read_csv("county/Crosswalk_2000_2010.csv") %>%
  filter(m1_weight != 0) %>%
  transmute(
    geoid_2000 = str_c(str_sub(gisjoin_2000, 2, 3),
                       str_sub(gisjoin_2000, 5, 7)),
    geoid_2010 = str_c(str_sub(gisjoin_2010, 2, 3),
                       str_sub(gisjoin_2010, 5, 7)),
    m1_weight
  )


# 2020 -> 2010 county crosswalk. Downloaded from
# https://www.openicpsr.org/openicpsr/project/150101/version/V4/view?path=/openicpsr/150101/fcr:versions/V4/crosswalks/CountyToCounty/2010/2010_csv.zip&type=file
# Rows whose weight is zero (or missing) are dropped; a stricter cut-off
# (m1_weight > .01) is intentionally not applied. Unlike the other crosswalks,
# the 2020 code is only left-padded with zeros to five characters, while the
# 2010 code is assembled from characters 2-3 and 5-7 of gisjoin_2010.
xwalk_2020 <- read_csv("county/Crosswalk_2020_2010.csv") %>%
  filter(m1_weight != 0) %>%
  transmute(
    geoid_2020 = str_pad(gisjoin_2020, width = 5, side = "left", pad = "0"),
    geoid_2010 = str_c(str_sub(gisjoin_2010, 2, 3),
                       str_sub(gisjoin_2010, 5, 7)),
    m1_weight
  )


# Stack the three crosswalks and reshape them into one long lookup table with
# one row per (2010 county, source county, source year):
#   value        - the source-year county code
#   year         - the source year, taken from the geoid_<year> column name
#   m1_weight    - weight summed over duplicate (geoid_2010, value, year) rows
#   total        - sum of m1_weight over all 2010 counties for a source county
#   final_weight - m1_weight / total, the share of the source county assigned
#                  to this 2010 county
#   dif          - TRUE when the 2010 code equals the source code
# bind_rows() fills the geoid_<year> columns a crosswalk does not have with NA,
# so pivoting would otherwise emit NA placeholder rows for every crosswalk row;
# values_drop_na = TRUE removes them.
county_xwalk <- bind_rows(xwalk_1990, xwalk_2000, xwalk_2020) %>%
  pivot_longer(-c(geoid_2010, m1_weight), values_drop_na = TRUE) %>%
  mutate(year = str_extract(name, "[0-9][0-9][0-9][0-9]")) %>%
  select(-name) %>%
  group_by(geoid_2010, value, year) %>%
  summarise(m1_weight = sum(m1_weight, na.rm = TRUE), .groups = "drop") %>%
  group_by(value, year) %>%
  mutate(total = sum(m1_weight)) %>%
  ungroup() %>%
  mutate(final_weight = m1_weight / total,
         dif = geoid_2010 == value)
# A trim of near-0 / near-1 weights
# (final_weight < .9999, final_weight > .0001) is intentionally not applied.

# 2010 county codes that appear in at least one crosswalk row whose source
# code differs from the 2010 code.
changed_geoid_10 <- county_xwalk %>%
  filter(!dif) %>%
  distinct(geoid_2010) %>%
  pull(geoid_2010)

# The matching source-year (non-2010) codes from those same rows.
changed_geoid_oth <- county_xwalk %>%
  filter(!dif) %>%
  distinct(value) %>%
  pull(value)


# Years whose county codes have to be mapped onto 2010 counties through the
# crosswalk. Defined once so the two complementary filters below cannot drift
# apart.
years_needing_xwalk <- c("1990",
                         "2000",
                         "2005-2009",
                         "2016-2020",
                         "2017-2021")

# Years treated as already being on the target boundaries (note: not actually
# true for some variables; that is fixed further down).
good_counties <- county %>%
  filter(!year %in% years_needing_xwalk)

# Years that need fixing. other_year is the crosswalk year used for the join:
# "2005-2009" uses the 2000 crosswalk, "2016-2020" and "2017-2021" use the 2020
# crosswalk, and any other year uses itself.
bad_counties_years <- county %>%
  filter(year %in% years_needing_xwalk) %>%
  mutate(other_year = case_when(
    year == "2005-2009" ~ "2000",
    year %in% c("2016-2020",
                "2017-2021") ~ "2020",
    TRUE ~ year
  ))


 # join crosswalk 
# Attach the crosswalk to the years that need fixing; a county that is split
# across several 2010 counties yields one row per target county.
pre <- bad_counties_years %>%
  left_join(county_xwalk, by = c("geoid" = "value", "other_year" = "year")) %>%
  # no crosswalk match: the county keeps its own code as the 2010 code
  mutate(geoid_2010 = if_else(is.na(geoid_2010), geoid, geoid_2010),
         dif = geoid_2010 == geoid) %>%
  group_by(geoid, year) %>%
  # non_same counts the rows of this county-year that map to a different code;
  # when there are none the county is carried over with a weight of 1
  mutate(non_same = sum(!dif),
         final_weight = if_else(non_same == 0, 1, final_weight)) %>%
  # drop the grouping so it does not leak into the pipelines that consume pre
  # (one of them modifies geoid, which is a grouping column here)
  ungroup() %>%
  # manual recode of 51515 to 51019 (same recode is applied to the other tables)
  mutate(geoid_2010 = case_when(
    geoid_2010 == "51515" ~ "51019",
    TRUE ~ geoid_2010))


# Re-weight count variables onto 2010 counties: every selected column is
# multiplied by the crosswalk weight and summed within (geoid_2010, year),
# ignoring missing values.
# The selection excludes adjusted / percentage / median columns, identifiers and
# the CPI columns. It also excludes the crosswalk bookkeeping columns created
# while building `pre` (m1_weight, total, dif, non_same and final_weight
# itself): they are not counts, and summarising final_weight inside the same
# across() that multiplies by it can overwrite the weight while the remaining
# columns are still being processed.
counts <- pre %>%
  group_by(geoid_2010, year) %>%
  summarise(across(-c(contains("adj"),
                      contains("_perc"),
                      contains("med"),
                      geoid,
                      gisjoin,
                      other_year,
                      avg_cpi:cpi_2021,
                      base_year,
                      any_of(c("m1_weight",
                               "total",
                               "dif",
                               "non_same",
                               "final_weight"))),
                   ~sum(. * final_weight, na.rm = TRUE)),
            .groups = "drop")


# Re-weight the non-count measures (median value, incomes, rent) onto 2010
# counties. Each measure is first multiplied by a companion count; the products
# and the counts are then summed with the crosswalk weight within
# (geoid_2010, year), and each weighted product total is divided by its weighted
# count total. The sums do not use na.rm, so a missing input makes the result
# for that county-year NA.
# The former first step truncated geoid to five characters. geoid is not
# carried into the result, so the step is dropped; this also avoids modifying a
# column that `pre` may still be grouped by.
non_counts <- pre %>%
  mutate(t_median_value = median_value * owner_occ,
         t_median_value_adj = median_value_adj * owner_occ,
         t_med_household_inc = med_household_income * total_households,
         t_med_household_inc_adj = med_household_income_adj * total_households,
         t_med_gross_rent = med_gross_rent * renter_occ,
         t_med_gross_rent_adj = med_gross_rent_adj * renter_occ,
         t_med_family_inc = med_family_income * total_population,
         t_med_family_inc_adj = med_family_income_adj * total_population) %>%
  group_by(geoid_2010, year) %>%
  # the t_* columns were appended in order above, so the range is contiguous
  summarise(across(c(t_median_value:t_med_family_inc_adj,
                     owner_occ,
                     total_households,
                     renter_occ,
                     total_population), ~sum(. * final_weight)),
            .groups = "drop") %>%
  mutate(median_value = t_median_value / owner_occ,
         median_value_adj = t_median_value_adj / owner_occ,
         med_household_income = t_med_household_inc / total_households,
         med_household_income_adj = t_med_household_inc_adj / total_households,
         med_gross_rent = t_med_gross_rent / renter_occ,
         med_gross_rent_adj = t_med_gross_rent_adj / renter_occ,
         med_family_income = t_med_family_inc / total_population,
         med_family_income_adj = t_med_family_inc_adj / total_population) %>%
  # keep only the keys and the re-weighted measures; the counts come from the
  # count table this result is joined to
  select(-c(starts_with("t_"),
            owner_occ,
            renter_occ,
            total_households,
            total_population))


# Combine the re-weighted counts with the re-weighted non-count measures,
# matching on year and 2010 county code.
odd_counties_adj <- counts %>%
  left_join(non_counts, by = c("year", "geoid_2010"))


# Manually map county codes that changed between 2010 and 2020 back to the
# code used as the 2010 identifier, then expose the column as geoid_2010.
good_counties_1 <- good_counties %>%
  mutate(geoid = case_when(
    geoid == "02158" ~ "02270",
    geoid == "46102" ~ "46113",
    geoid == "51515" ~ "51019",
    TRUE ~ geoid
  )) %>%
  rename(geoid_2010 = geoid)
  
# Sum the count variables for counties that do not have exactly 10 rows under
# their (recoded) 2010 code, one total per county and year.
# NOTE(review): 10 is presumably the number of years kept in good_counties -
# confirm, and keep it in step with the filter used for the non-count table.
good_counties_counts <- good_counties_1 %>%
  add_count(geoid_2010) %>%
  filter(n != 10) %>%
  group_by(geoid_2010, year) %>%
  summarise(
    across(
      -c(contains("adj"),
         contains("_perc"),
         contains("med"),
         gisjoin,
         avg_cpi:cpi_2021,
         base_year),
      function(col) sum(col, na.rm = TRUE)
    ),
    .groups = "drop"
  )
  
# Aggregate the non-count measures for the same counties (those without exactly
# 10 rows under their 2010 code). Each measure is multiplied by a companion
# count, products and counts are summed per county and year (no na.rm, so a
# missing input gives NA), and the product total is divided by the count total.
good_counties_non <- good_counties_1 %>%
  add_count(geoid_2010) %>%
  filter(n != 10) %>%
  mutate(
    t_median_value = median_value * owner_occ,
    t_median_value_adj = median_value_adj * owner_occ,
    t_med_household_inc = med_household_income * total_households,
    t_med_household_inc_adj = med_household_income_adj * total_households,
    t_med_gross_rent = med_gross_rent * renter_occ,
    t_med_gross_rent_adj = med_gross_rent_adj * renter_occ,
    t_med_family_inc = med_family_income * total_population,
    t_med_family_inc_adj = med_family_income_adj * total_population
  ) %>%
  group_by(geoid_2010, year) %>%
  summarise(
    across(
      c(t_median_value:t_med_family_inc_adj,
        owner_occ,
        total_households,
        renter_occ,
        total_population),
      sum
    ),
    .groups = "drop"
  ) %>%
  # keep the keys and the eight averaged measures only
  transmute(
    geoid_2010,
    year,
    median_value = t_median_value / owner_occ,
    median_value_adj = t_median_value_adj / owner_occ,
    med_household_income = t_med_household_inc / total_households,
    med_household_income_adj = t_med_household_inc_adj / total_households,
    med_gross_rent = t_med_gross_rent / renter_occ,
    med_gross_rent_adj = t_med_gross_rent_adj / renter_occ,
    med_family_income = t_med_family_inc / total_population,
    med_family_income_adj = t_med_family_inc_adj / total_population
  )

# Put the aggregated counts and aggregated non-count measures side by side.
good_counties_full <- left_join(
  good_counties_counts,
  good_counties_non,
  by = c("geoid_2010", "year")
)

# Replace the original rows of every aggregated county with its aggregated
# rows: drop those counties from the unaggregated table, then append the
# aggregated version.
good_counties_final <- good_counties_1 %>%
  anti_join(good_counties_full, by = "geoid_2010") %>%
  bind_rows(good_counties_full)

# Stack the counties that needed no crosswalk with the crosswalk-adjusted ones,
# then (re)compute every share / rate variable from the counts so that
# aggregated rows get values consistent with their summed counts.
county_final <- bind_rows(good_counties_final, odd_counties_adj) %>%
  mutate(
    white_perc = white / total_population,
    black_perc = black / total_population,
    asian_perc = asian / total_population,
    hawaiian_perc = hawaiian / total_population,
    american_alaskan_perc = american_alaskan / total_population,
    two_or_more_perc = two_or_more / total_population,
    other_perc = other / total_population,
    rural_perc = rural / total_population,
    bachelors_perc = bachelors_or_over / education_total,
    hispanic_perc = hispanic / total_population,
    poverty_perc = poverty / poverty_total,
    unemployment = unemployed / employment_total,
    turnover_perc = moved_last_year / moved_total
  )

# County table restricted to the output columns.
county_final_1 <- select(county_final, any_of(my_vars))

# cbsa_to_county <- read_csv("cbsa2fipsxw.csv") %>% 
#   transmute(cbsacode, 
#          cbsatitle,
#          geoid_2010 = str_c(str_pad(fipsstatecode, 2, "left", "0"),
#                             str_pad(fipscountycode, 3, "left", "0")))

# County -> CBSA lookup. The first two lines of the file are skipped, column
# names are normalised with janitor, and the county code is built from the
# zero-padded state (2 characters) and county (3 characters) codes.
cbsa_to_county <- read_csv("county_to_cbsa.csv", skip = 2) %>%
  janitor::clean_names() %>%
  transmute(
    cbsa_code,
    cbsa_title,
    metro = metropolitan_micropolitan_statistical_area,
    geoid_2010 = str_c(
      str_pad(fips_state_code, width = 2, side = "left", pad = "0"),
      str_pad(fips_county_code, width = 3, side = "left", pad = "0")
    )
  ) %>%
  # same manual recode as in the county tables; distinct() then removes any
  # rows that became duplicates
  mutate(geoid_2010 = if_else(geoid_2010 == "51515", "51019", geoid_2010)) %>%
  distinct()


# Attach CBSA information to every county and sum the count variables per CBSA
# and year.
# NOTE(review): this is a left join, so counties with no row in cbsa_to_county
# keep NA CBSA fields and are summed together as a single NA group per year -
# confirm that this group is wanted in the output.
msa_counts <- county_final %>%
  left_join(cbsa_to_county, by = "geoid_2010") %>%
  group_by(cbsa_code, cbsa_title, metro, year) %>%
  summarise(
    across(
      -c(contains("adj"),
         contains("_perc"),
         contains("med"),
         gisjoin,
         geoid_2010,
         avg_cpi:cpi_2021,
         base_year),
      function(col) sum(col, na.rm = TRUE)
    ),
    .groups = "drop"
  )

  
# Attach CBSA information to every county and aggregate the non-count measures
# per CBSA and year: each measure is multiplied by a companion count, products
# and counts are summed (no na.rm, so a missing county value gives NA for the
# whole CBSA-year), and each product total is divided by its count total.
msa_non_counts <- county_final %>%
  left_join(cbsa_to_county, by = "geoid_2010") %>%
  mutate(
    t_median_value = median_value * owner_occ,
    t_median_value_adj = median_value_adj * owner_occ,
    t_med_household_inc = med_household_income * total_households,
    t_med_household_inc_adj = med_household_income_adj * total_households,
    t_med_gross_rent = med_gross_rent * renter_occ,
    t_med_gross_rent_adj = med_gross_rent_adj * renter_occ,
    t_med_family_inc = med_family_income * total_population,
    t_med_family_inc_adj = med_family_income_adj * total_population
  ) %>%
  group_by(cbsa_code, cbsa_title, metro, year) %>%
  summarise(
    across(
      c(t_median_value:t_med_family_inc_adj,
        owner_occ,
        total_households,
        renter_occ,
        total_population),
      sum
    ),
    .groups = "drop"
  ) %>%
  mutate(
    median_value = t_median_value / owner_occ,
    median_value_adj = t_median_value_adj / owner_occ,
    med_household_income = t_med_household_inc / total_households,
    med_household_income_adj = t_med_household_inc_adj / total_households,
    med_gross_rent = t_med_gross_rent / renter_occ,
    med_gross_rent_adj = t_med_gross_rent_adj / renter_occ,
    med_family_income = t_med_family_inc / total_population,
    med_family_income_adj = t_med_family_inc_adj / total_population
  ) %>%
  # drop the helper products and the counts; the counts come from msa_counts
  select(
    -starts_with("t_"),
    -owner_occ,
    -renter_occ,
    -total_households,
    -total_population
  )

# Combine the CBSA-level counts with the CBSA-level non-count measures,
# recompute the share / rate variables from the summed counts, and keep only
# the CBSA identifiers plus the output columns.
msa_final <- msa_counts %>%
  left_join(
    msa_non_counts,
    by = c("cbsa_code", "cbsa_title", "metro", "year")
  ) %>%
  mutate(
    white_perc = white / total_population,
    black_perc = black / total_population,
    asian_perc = asian / total_population,
    hawaiian_perc = hawaiian / total_population,
    american_alaskan_perc = american_alaskan / total_population,
    two_or_more_perc = two_or_more / total_population,
    other_perc = other / total_population,
    rural_perc = rural / total_population,
    bachelors_perc = bachelors_or_over / education_total,
    hispanic_perc = hispanic / total_population,
    poverty_perc = poverty / poverty_total,
    unemployment = unemployed / employment_total,
    turnover_perc = moved_last_year / moved_total
  ) %>%
  select(cbsa_code, cbsa_title, metro, any_of(my_vars))

# Write one year of one geography to "<my_geo>/<my_geo>_<my_year>.csv".
#
# my_year: value matched against the `year` column of the chosen table.
# my_geo:  "county" writes county_final_1, "msa" writes msa_final, and any
#          other value writes tract_1. It is also used as the output folder
#          name and the file-name prefix.
#
# The tables are read from the global environment. The output folder is
# created when it does not exist yet, because write_csv() cannot write into a
# missing directory.
write_out <- function(my_year, my_geo) {
  if (my_geo == "county") {
    out_data <- county_final_1
  } else if (my_geo == "msa") {
    out_data <- msa_final
  } else {
    out_data <- tract_1
  }

  if (!dir.exists(my_geo)) {
    dir.create(my_geo, recursive = TRUE)
  }

  out_data %>%
    filter(year == my_year) %>%
    write_csv(str_c(my_geo, "/", my_geo, "_", my_year, ".csv"))
}


# Write one file per year for each geography. The list of years comes from the
# county table and is reused for the msa and tract outputs.
my_years <- unique(county$year)

walk(c("county", "msa", "tract"), function(geo) {
  walk(my_years, function(yr) write_out(yr, my_geo = geo))
})