-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvalidate_data_model.R
82 lines (70 loc) · 3.55 KB
/
validate_data_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(argparser)
library(AnvilDataModels)
library(AnVIL)
library(readr)
argp <- arg_parser("validate")
argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)")
argp <- add_argument(argp, "--model_file", help="json file with data model")
argp <- add_argument(argp, "--hash_id_nchar", default=16, help="number of characters in automatically generated ids")
argp <- add_argument(argp, "--skip_hash_id", flag=TRUE, help="skip automatic id generation")
argp <- add_argument(argp, "--skip_bucket_paths", flag=TRUE, help="skip checking for existence of google bucket paths")
argp <- add_argument(argp, "--use_existing_tables", flag=TRUE, help="for any tables in the data model but not included in table_files, read the existing table from the AnVIL workspace for validation")
argp <- add_argument(argp, "--stop_on_fail", flag=TRUE, help="return an error code if table_files do not pass checks")
argp <- add_argument(argp, "--workspace_name", help="name of AnVIL workspace to import data to")
argp <- add_argument(argp, "--workspace_namespace", help="namespace of AnVIL workspace to import data to")
argv <- parse_args(argp)
# argv <- list(table_files="testdata/table_files.tsv",
# model_file="testdata/data_model.json")
# read data model
model <- json_to_dm(argv$model_file)
# read tables
table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc")
message("tables to validate:")
print(table_files$names)
# check if we need to add any columns to files
if (!argv$skip_hash_id) {
if (length(attr(model, "auto_id")) > 0) {
tables <- read_data_tables(table_files$files, table_names=table_files$names)
# add auto columns
tables2 <- lapply(names(tables), function(t) {
add_auto_columns(tables[[t]], table_name=t, model=model, nchar=argv$hash_id_nchar)
})
names(tables2) <- names(tables)
# write new tables
new_files <- paste("output", names(tables), "table.tsv", sep="_")
names(new_files) <- names(tables)
for (t in names(tables2)) {
write_tsv(tables2[[t]], new_files[t], escape="none")
}
} else {
new_files <- setNames(table_files$files, table_files$names)
}
# write list of tables with names
tibble(name=names(new_files), file=unlist(new_files)) %>%
write_tsv("output_tables.tsv", col_names=FALSE)
check_files <- new_files
} else {
check_files <- setNames(table_files$files, table_files$names)
}
if (argv$use_existing_tables) {
existing_table_names <- avtables(namespace=argv$workspace_namespace, name=argv$workspace_name)$table
required_tables <- AnvilDataModels:::.parse_required_tables(names(check_files), model)$required
workspace_tables <- setdiff(required_tables, names(check_files))
message("tables read from workspace:")
print(workspace_tables)
for (t in workspace_tables) {
dat <- avtable(t, namespace=argv$workspace_namespace, name=argv$workspace_name)
if (grepl("_set$", t)) {
dat <- unnest_set_table(dat)
}
check_files[t] <- paste0(t, "_current.tsv")
write_tsv(dat, check_files[t], escape="none")
}
}
params <- list(tables=check_files, model=argv$model_file, check_bucket_paths=!argv$skip_bucket_paths)
pass <- custom_render_markdown("data_model_report", "data_model_validation", parameters=params)
if (argv$stop_on_fail) {
if (!pass) stop("table_files not compatible with data model; see data_model_validation.html")
} else {
writeLines(tolower(as.character(pass)), "pass.txt")
}