01-create-database.R
# This script creates a Monet database and populates it with the tables needed
# to generate mode- and race-specific origin-destination flow matrices.
library(DBI)
library(MonetDBLite)
library(readr)
library(tidyr)
library(dplyr)
# Create a local embedded MonetDB database (persisted on disk in dbdir) to
# store the CTPP data
dbdir <- "monet_ctpp"
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
# Required lookup tables -------------------------------------------------------
# monetdb.read.csv() doesn't work directly with these lookup table files
# because of an encoding issue. The best workaround is to read them into R
# first and then write them to the database with dbWriteTable().
lookup_res <- read_delim("lookup_tables/acs_2006thru2010_ctpp_res_geo.txt",
                         delim = "|",
                         guess_max = 200000)
lookup_pow <- read_delim("lookup_tables/acs_2006thru2010_ctpp_pow_geo.txt",
                         delim = "|",
                         guess_max = 200000)
table_shell <- read_delim("lookup_tables/acs_2006thru2010_ctpp_table_shell.txt",
                          delim = "|",
                          guess_max = 200000)
# Convert variable names to lower case and change reserved words
names(lookup_res) <- tolower(names(lookup_res))
names(lookup_pow) <- tolower(names(lookup_pow))
names(table_shell) <- tolower(names(table_shell))
names(lookup_res)[names(lookup_res) == "year"] <- "year_"
names(lookup_pow)[names(lookup_pow) == "year"] <- "year_"
# Write the tables to the db
dbWriteTable(con, "lookupres", lookupres)
dbWriteTable(con, "lookuppow", lookuppow)
dbWriteTable(con, "tableshell", tableshell)
# Required tables for Part 1 - Place of residence ------------------------------
# Part 1 tables can generally be read directly into the database using
# monetdb.read.csv().
# Table A102106 - Means of transportation
monetdb.read.csv(con,
                 "data/TX_2006thru2010_A102106.csv",
                 "a102106",
                 nrow.check = 200000,
                 lower.case.names = TRUE)
# Table B102201 - Minority status by means of transportation (most detailed)
b102201 <- read_csv("data/TX_2006thru2010_B102201.csv")
names(b102201) <- tolower(names(b102201))
dbWriteTable(con, "b102201", b102201)
rm(b102201)
# Required tables for Part 2 - Place of work -----------------------------------
# Table B202200 - Minority status by means of transportation (most detailed)
b202200 <- read_csv("data/TX_2006thru2010_B202200.csv")
names(b202200) <- tolower(names(b202200))
dbWriteTable(con, "b202200", b202200)
rm(b202200)
# Table A202103 - Earnings in the past 12 months
a202103 <- read_csv("data/TX_2006thru2010_A202103.csv")
names(a202103) <- tolower(names(a202103))
dbWriteTable(con, "a202103", a202103)
rm(a202103)
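# The read / lower-case names / write pattern above repeats for every table.
# A small helper like this one (illustrative only; load_table() is a made-up
# name and is not used elsewhere in this script) keeps the steps together:
load_table <- function(con, csv_path, table_name) {
  df <- read_csv(csv_path)
  names(df) <- tolower(names(df))
  dbWriteTable(con, table_name, df)
  invisible(table_name)
}
# Equivalent to the a202103 block above:
# load_table(con, "data/TX_2006thru2010_A202103.csv", "a202103")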
# Required tables for Part 3 - Flow --------------------------------------------
# Part 3 tables cannot be read directly into the database with
# monetdb.read.csv(): the "source" column throws an error because its final
# row contains text, and that column looks inconsistent in general.
# Instead, read the required tables into R and write them with dbWriteTable().
# Table A302103 - Means of transportation
# Table B302105 - Minority status
table_names <- c("a302103", "b302105")
a302103 <- read_csv("data/TX_2006thru2010_A302103.csv")
names(a302103) <- tolower(names(a302103))
dbWriteTable(con, "a302103", a302103)
b302105 <- read_csv("data/TX_2006thru2010_B302105.csv")
names(b302105) <- tolower(names(b302105))
dbWriteTable(con, "b302105", b302105)
# Extract tract-level geographic identifiers from flow tables
# Create separate origin and destination variables
for(i in table_names) {
  # Define a summary level variable (why isn't this in the table to begin with?)
  dbSendQuery(
    con, paste0("
    ALTER TABLE ", i, "
    ADD COLUMN sumlev char(3)"))
  dbSendQuery(
    con, paste0("
    UPDATE ", i, "
    SET sumlev = SUBSTRING(geoid, 1, 3)"))
  # Create a new table containing tract-tract pairs (sumlev C54)
  dbSendQuery(
    con, paste0("
    CREATE TABLE ", i, "_tract
    AS SELECT geoid, lineno, est, se
    FROM ", i, "
    WHERE sumlev = 'C54'"))
  dbSendQuery(
    con, paste0("
    ALTER TABLE ", i, "_tract
    ADD COLUMN origin char(11)"))
  dbSendQuery(
    con, paste0("
    ALTER TABLE ", i, "_tract
    ADD COLUMN destination char(11)"))
  dbSendQuery(
    con, paste0("
    UPDATE ", i, "_tract
    SET origin = SUBSTRING(geoid, 8, 11)"))
  dbSendQuery(
    con, paste0("
    UPDATE ", i, "_tract
    SET destination = SUBSTRING(geoid, 19, 11)"))
}
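# Optional check (an illustrative addition, not part of the original script):
# every row of the new tract-level tables should now have both origin and
# destination populated. Shown for a302103_tract only.
dbGetQuery(con, "
  SELECT COUNT(*) AS n_missing
  FROM a302103_tract
  WHERE origin IS NULL OR destination IS NULL")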
# Clean up
rm(a302103)
rm(b302105)
# dbRemoveTable(con, "a302103")
# dbRemoveTable(con, "a302103_tract")
# dbRemoveTable(con, "b302105")
# dbRemoveTable(con, "b302105_tract")
dbDisconnect(con, shutdown = TRUE)