add interlacer support via read_interlaced_resource() #213

Open
wants to merge 3 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
cli,
dplyr,
httr,
interlacer (>= 0.3.2),
jsonlite,
purrr,
readr (>= 2.1.0),
1 change: 1 addition & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ export(create_package)
export(create_schema)
export(get_schema)
export(problems)
export(read_interlaced_resource)
export(read_package)
export(read_resource)
export(remove_resource)
1 change: 1 addition & 0 deletions R/frictionless-package.R
@@ -2,4 +2,5 @@
"_PACKAGE"

#' @import rlang

NULL
259 changes: 190 additions & 69 deletions R/read_resource.R
@@ -16,6 +16,9 @@
#' @param col_select Character vector of the columns to include in the result,
#' in the order provided.
#' Selecting columns can improve read speed.
#' @param interlaced Logical value indicating whether columns should be read
#' as interlaced columns using the interlacer package.
#' @param ... Arguments passed to `read_resource()`.
#' @return [tibble()] data frame with the Data Resource's tabular data.
#' If there are parsing problems, a warning will alert you.
#' You can retrieve the full details by calling [problems()] on your data
@@ -198,7 +201,8 @@
#'
#' # Read data from the resource "deployments" with column selection
#' read_resource(package, "deployments", col_select = c("latitude", "longitude"))
read_resource <- function(package, resource_name, col_select = NULL) {
read_resource <- function(package, resource_name, col_select = NULL,
interlaced = FALSE) {
# Get resource, includes check_package()
resource <- get_resource(package, resource_name)

@@ -261,77 +265,27 @@ read_resource <- function(package, resource_name, col_select = NULL) {
)

# Create col_types: list(<collector_character>, <collector_logical>, ...)
col_types <- purrr::map(fields, function(x) {
type <- x$type %||% NA_character_
enum <- x$constraints$enum
group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
format <- x$format %||% "default" # Undefined => default

# Assign types and formats
col_type <- switch(type,
"string" = if (length(enum) > 0) {
readr::col_factor(levels = enum)
} else {
readr::col_character()
},
"number" = if (length(enum) > 0) {
readr::col_factor(levels = as.character(enum))
} else if (group_char) {
readr::col_number() # Supports grouping_mark
} else if (bare_number) {
readr::col_double() # Allows NaN, INF, -INF
} else {
readr::col_number() # Strips non-num. chars, uses default grouping_mark
},
"integer" = if (length(enum) > 0) {
readr::col_factor(levels = as.character(enum))
} else if (bare_number) {
readr::col_double() # Not col_integer() to avoid big integers issues
} else {
readr::col_number() # Strips non-numeric chars
},
"boolean" = readr::col_logical(),
"object" = readr::col_character(),
"array" = readr::col_character(),
"date" = readr::col_date(format = switch(format,
"default" = "%Y-%m-%d", # ISO
"any" = "%AD", # YMD
"%x" = "%m/%d/%y", # Python strptime for %x
format # Default
)),
"time" = readr::col_time(format = switch(format,
"default" = "%AT", # H(MS)
"any" = "%AT", # H(MS)
"%X" = "%H:%M:%S", # HMS
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"datetime" = readr::col_datetime(format = switch(format,
"default" = "", # ISO (lenient)
"any" = "", # ISO (lenient)
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"year" = readr::col_date(format = "%Y"),
"yearmonth" = readr::col_date(format = "%Y-%m"),
"duration" = readr::col_character(),
"geopoint" = readr::col_character(),
"geojson" = readr::col_character(),
"any" = readr::col_character()
)
# col_type will be NULL when type is undefined (NA_character_) or an
# unrecognized value (e.g. "datum", but will be blocked by check_schema()).
# Set those to col_guess().
col_type <- col_type %||% readr::col_guess()
col_type
})

# Assign names: list("name1" = <collector_character>, "name2" = ...)
names(col_types) <- field_names
col_types <- interlacer::as.x_col_spec(
purrr::map(set_names(fields, field_names), field_to_x_col)
)

# Select CSV dialect, see https://specs.frictionlessdata.io/csv-dialect/
# Note that dialect can be NULL
dialect <- read_descriptor(resource$dialect, package$directory, safe = TRUE)

na_col <- missing_values_to_na_col(
schema$missingValues %||% "",
.name = "default"
)

if (interlaced) {
read_fn <- interlacer::read_interlaced_delim
na <- na_col
} else {
read_fn <- readr::read_delim
na <- na_col$chr_values
}

# Read data directly
if (resource$read_from == "df") {
df <- dplyr::as_tibble(resource$data)
@@ -344,7 +298,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
} else if (resource$read_from == "path" || resource$read_from == "url") {
dataframes <- list()
for (i in seq_along(paths)) {
data <- readr::read_delim(
data <- read_fn(
file = paths[i],
delim = dialect$delimiter %||% ",",
quote = dialect$quoteChar %||% "\"",
@@ -365,7 +319,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
# a column, see https://rlang.r-lib.org/reference/topic-data-mask.html
col_select = {{col_select}},
locale = locale,
na = schema$missingValues %||% "",
na = na,
comment = dialect$commentChar %||% "",
trim_ws = dialect$skipInitialSpace %||% FALSE,
# Skip header row when present
@@ -380,3 +334,170 @@ read_resource <- function(package, resource_name, col_select = NULL) {

return(df)
}

#' @rdname read_resource
#' @export
read_interlaced_resource <- function(...) {
read_resource(..., interlaced = TRUE)
}
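# Illustration, not part of this diff: read_interlaced_resource() is a thin
# wrapper, so the two calls below are equivalent. A sketch that assumes this
# branch is loaded and the working directory is the package root, so the
# "interlaced" test resource added in tests/testthat/data/types.json resolves.
example_package <- read_package("tests/testthat/data/types.json")
read_interlaced_resource(example_package, "interlaced")
read_resource(example_package, "interlaced", interlaced = TRUE)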

field_to_x_col <- function(x) {
interlacer::x_col(
field_to_v_col(x),
missing_values_to_na_col(x$missingValues, .name = x$name)
)
}
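# Illustration, not part of this diff: a field descriptor with field-level
# missingValues maps to a pair of collectors, one per channel.
field_to_x_col(list(name = "na_int", type = "number", missingValues = c("-99", "-98")))
# -> interlacer::x_col(readr::col_double(), interlacer::na_col_integer(-99, -98))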

field_to_v_col <- function(x) {
type <- x$type %||% NA_character_
categories <- x$categories
categoriesOrdered <- x$categoriesOrdered %||% FALSE
group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
format <- x$format %||% "default" # Undefined => default

# Assign types and formats
col_type <- switch(type,
"string" = if (!is.null(categories)) {
categories_to_v_col(categories, categoriesOrdered, .name = x$name)
} else {
readr::col_character()
},
"number" = if (group_char) {
readr::col_number() # Supports grouping_mark
} else if (bare_number) {
readr::col_double() # Allows NaN, INF, -INF
} else {
readr::col_number() # Strips non-num. chars, uses default grouping_mark
},
"integer" = if (!is.null(categories)) {
categories_to_v_col(categories, categoriesOrdered, .name = x$name)
} else if (bare_number) {
readr::col_double() # Not col_integer() to avoid big integers issues
} else {
readr::col_number() # Strips non-numeric chars
},
"boolean" = readr::col_logical(),
"object" = readr::col_character(),
"array" = readr::col_character(),
"date" = readr::col_date(format = switch(format,
"default" = "%Y-%m-%d", # ISO
"any" = "%AD", # YMD
"%x" = "%m/%d/%y", # Python strptime for %x
format # Default
)),
"time" = readr::col_time(format = switch(format,
"default" = "%AT", # H(MS)
"any" = "%AT", # H(MS)
"%X" = "%H:%M:%S", # HMS
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"datetime" = readr::col_datetime(format = switch(format,
"default" = "", # ISO (lenient)
"any" = "", # ISO (lenient)
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"year" = readr::col_date(format = "%Y"),
"yearmonth" = readr::col_date(format = "%Y-%m"),
"duration" = readr::col_character(),
"geopoint" = readr::col_character(),
"geojson" = readr::col_character(),
"any" = readr::col_character()
)
# col_type will be NULL when type is undefined (NA_character_) or an
# unrecognized value (e.g. "datum", but will be blocked by check_schema()).
# Set those to col_guess().
col_type <- col_type %||% readr::col_guess()
col_type
}
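# Illustration, not part of this diff: field_to_v_col() turns a Table Schema
# field descriptor (a named list) into a readr collector for the value channel.
field_to_v_col(list(name = "mass", type = "number", groupChar = ","))
# -> readr::col_number(), so "1,234" is read as 1234
field_to_v_col(list(name = "eventDate", type = "date", format = "%x"))
# -> readr::col_date(format = "%m/%d/%y")
field_to_v_col(list(name = "note"))
# -> readr::col_guess(), because the type is undefined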

categories_to_v_col <- function(categories, categoriesOrdered, .name) {
if (is.character(categories) || is.numeric(categories)) {
readr::col_factor(
levels = as.character(categories),
ordered = categoriesOrdered
)
} else if (is.list(categories)) {
have_value_prop <- purrr::map_lgl(categories, \(v) !is.null(v$value))

if (!all(have_value_prop)) {
cli::cli_abort(
"categories for {.name} is missing a `value` property"
)
}

values <- purrr::map_vec(categories, \(v) v$value)

labels <- purrr::map_chr(
seq_along(categories),
\(i) categories[[i]]$label %||% categories[[i]]$value
)

if (all(values == labels)) {
readr::col_factor(
levels = as.character(values),
ordered = categoriesOrdered
)
} else {
interlacer::v_col_cfactor(
codes = set_names(values, labels),
ordered = categoriesOrdered
)
}
}
}
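# Illustration, not part of this diff: bare categories become a plain factor
# collector; value/label pairs become an interlacer coded factor, so codes in
# the file are decoded to labels on read.
categories_to_v_col(c("red", "green", "blue"), FALSE, .name = "color")
# -> readr::col_factor(levels = c("red", "green", "blue"))
categories_to_v_col(
list(list(value = 10, label = "APPLE"), list(value = 20, label = "BANANA")),
FALSE, .name = "fruit"
)
# -> interlacer::v_col_cfactor(codes = c(APPLE = 10, BANANA = 20))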

missing_values_to_na_col <- function(missing_values, .name) {
if (is.null(missing_values)) {
interlacer::na_col_default()
} else if (is_numeric_coercible(missing_values)) {
inject(interlacer::na_col_integer(!!!as.integer(missing_values)))
} else if (is.character(missing_values)) {
inject(interlacer::na_col_factor(!!!missing_values))
} else if (is.list(missing_values)) {
if (length(missing_values) == 0) {
interlacer::na_col_none()
} else {
have_value_prop <- purrr::map_lgl(missing_values, \(v) !is.null(v$value))

if (!all(have_value_prop)) {
cli::cli_abort(
"missingValues for {.name} is missing a `value` property"
)
}

values <- purrr::map_chr(missing_values, \(v) v$value)

if (is_numeric_coercible(values)) {
values <- as.numeric(values)
}

labels <- purrr::map_chr(
seq_along(missing_values),
\(i) missing_values[[i]]$label %||% missing_values[[i]]$value
)

if (all(values == labels)) {
readr::col_factor(levels = values)
} else {
inject(interlacer::na_col_cfactor(!!!set_names(values, labels)))
}
}
} else {
cli::cli_abort(
"Cannot process missingValues for {.name}; expected list of strings or objects"
)
}
}
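# Illustration, not part of this diff: each accepted shape of missingValues
# maps to a different NA-channel spec.
missing_values_to_na_col(NULL, .name = "x") # -> interlacer::na_col_default()
missing_values_to_na_col(list(), .name = "x") # -> interlacer::na_col_none()
missing_values_to_na_col(c("-99", "-98"), .name = "x") # -> interlacer::na_col_integer(-99, -98)
missing_values_to_na_col(c("OMITTED", "REFUSED"), .name = "x") # -> interlacer::na_col_factor("OMITTED", "REFUSED")
missing_values_to_na_col(
list(list(value = "-99", label = "OMITTED"), list(value = "-98", label = "REFUSED")),
.name = "x"
) # -> interlacer::na_col_cfactor(OMITTED = -99, REFUSED = -98)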

is_numeric_coercible <- function(x) {
if (is.numeric(x)) {
TRUE
} else if (is.character(x)) {
n_na <- sum(is.na(x))
conv <- suppressWarnings(as.numeric(x))
n_na == sum(is.na(conv))
} else {
FALSE
}
}
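# Illustration, not part of this diff:
is_numeric_coercible(c("-99", "-98")) # TRUE: every value parses as a number
is_numeric_coercible(c("OMITTED", "-99")) # FALSE: "OMITTED" would coerce to NA
is_numeric_coercible(list("-99")) # FALSE: neither numeric nor character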
10 changes: 9 additions & 1 deletion man/read_resource.Rd


5 changes: 5 additions & 0 deletions tests/testthat/data/type_fct.csv
@@ -0,0 +1,5 @@
fct_chr,fct_int,ord_chr,ord_int
red,10,red,10
red,20,red,20
green,10,green,10
blue,10,blue,10
6 changes: 6 additions & 0 deletions tests/testthat/data/type_interlaced.csv
@@ -0,0 +1,6 @@
na_fct,na_int,na_cfct,na_none,na_default,cfct_chr,cfct_int
1.1,1.1,1.1,1,1,a,10
2.2,2.2,2.2,2,2,b,20
OMITTED,-99,-99,3,OMITTED,a,10
REFUSED,-98,-98,3,3,b,20
5.5,5.5,5.5,4,4,a,10
78 changes: 58 additions & 20 deletions tests/testthat/data/types.json
@@ -13,12 +13,7 @@
{
"name": "str_factor",
"type": "string",
"constraints": {
"enum": [
"foo",
"bar"
]
}
"categories": ["foo", "bar"]
}
]
}
@@ -40,13 +35,7 @@
{
"name": "num_factor",
"type": "number",
"constraints": {
"enum": [
3.1,
3.2,
3.3
]
}
"notes": "This is deprecated; factors should only be int and string"
},
{
"name": "num_nan",
@@ -94,13 +83,7 @@
{
"name": "int_factor",
"type": "integer",
"constraints": {
"enum": [
3,
4,
-1
]
}
"categories": [3, 4, -1]
},
{
"name": "int_ws",
@@ -294,6 +277,61 @@
}
]
}
},
{
"name": "interlaced",
"path": "type_interlaced.csv",
"profile": "tabular-data-resource",
"schema": {
"missingValues": ["OMITTED"],
"fields": [
{
"name": "na_fct",
"type": "number",
"missingValues": ["OMITTED", "REFUSED"]
},
{
"name": "na_int",
"type": "number",
"missingValues": ["-99", "-98"]
},
{
"name": "na_cfct",
"type": "number",
"missingValues": [
{ "value": "-99", "label": "OMITTED" },
{ "value": "-98", "label": "REFUSED" }
]
},
{
"name": "na_none",
"type": "integer",
"missingValues": []
},
{
"name": "na_default",
"type": "number"
},
{
"name": "cfct_chr",
"type": "string",
"categories": [
{ "value": "a", "label": "APPLE" },
{ "value": "b", "label": "BANANA" }
],
"missingValues": []
},
{
"name": "cfct_int",
"type": "integer",
"categories": [
{ "value": 10, "label": "APPLE" },
{ "value": 20, "label": "BANANA" }
],
"missingValues": []
}
]
}
}
]
}
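As a sketch of the expected behaviour (illustrative only, not part of the diff; assumes this branch is loaded and is run from the package root), reading the new "interlaced" resource routes the schema-declared missing values into the NA channel while decoding coded factors:

package <- read_package("tests/testthat/data/types.json")
df <- read_interlaced_resource(package, "interlaced")
df$na_fct[[3]] == interlacer::na("OMITTED") # TRUE: "OMITTED" is a missing reason, not a value
df$na_int[[3]] == interlacer::na(-99) # TRUE: field-level missingValues "-99"/"-98"
df$cfct_chr[[1]] # APPLE: code "a" decoded to its label
interlacer::as.codes(df$cfct_chr[[1]]) # "a"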
52 changes: 42 additions & 10 deletions tests/testthat/test-read_resource.R
@@ -582,10 +582,10 @@ test_that("read_resource() handles strings", {
resource <- read_resource(p, "string")
expect_type(resource$str, "character")

# Use factor when enum is present
enum <- p$resources[[1]]$schema$fields[[2]]$constraints$enum
# Use factor when categories are present
categories <- p$resources[[1]]$schema$fields[[2]]$categories
expect_s3_class(resource$str_factor, "factor")
expect_identical(levels(resource$str_factor), enum)
expect_identical(levels(resource$str_factor), categories)
})

test_that("read_resource() handles numbers", {
@@ -598,10 +598,8 @@ test_that("read_resource() handles numbers", {
expect_type(resource$num_neg, "double")
expect_true(all(resource$num_neg == -3))

# Use factor when enum is present
enum <- p$resources[[2]]$schema$fields[[3]]$constraints$enum
expect_s3_class(resource$num_factor, "factor")
expect_identical(levels(resource$num_factor), as.character(enum))
# Use factor when enum is present (deprecated)
# TODO: remove num_factor from test file

# NaN, INF, -INF are supported, case-insensitive
expect_type(resource$num_nan, "double")
@@ -631,10 +629,10 @@ test_that("read_resource() handles integers (as doubles)", {
expect_type(resource$int_neg, "double")
expect_true(all(resource$int_neg == -3))

# Use factor when enum is present
enum <- p$resources[[3]]$schema$fields[[3]]$constraints$enum
# Use factor when categories are present
categories <- p$resources[[3]]$schema$fields[[3]]$categories
expect_s3_class(resource$int_factor, "factor")
expect_identical(levels(resource$int_factor), as.character(enum))
expect_identical(levels(resource$int_factor), as.character(categories))

# bareNumber = false allows whitespace and non-numeric characters
expect_type(resource$int_ws, "double")
@@ -738,3 +736,37 @@ test_that("read_resource() handles other types", {
# Guess undefined types, unknown types are blocked by check_schema()
expect_type(resource$no_type, "logical")
})

test_that("read_resource() handles interlaced types", {
p <- read_package(test_path("data/types.json"))
resource <- read_interlaced_resource(p, "interlaced")

# Interpret fct missing reasons
expect_s3_class(interlacer::na_channel(resource$na_fct), "factor")
expect_true(resource$na_fct[[3]] == interlacer::na("OMITTED"))

# Interpret int missing reasons
expect_type(interlacer::na_channel(resource$na_int), "integer")
expect_true(resource$na_int[[3]] == interlacer::na(-99))

# Interpret cfct missing reasons
expect_s3_class(interlacer::na_channel(resource$na_cfct), "interlacer_cfactor")
expect_true(resource$na_cfct[[3]] == interlacer::na("OMITTED"))

# Interpret none missing reasons
expect_s3_class(resource$na_none, NA)

# Interpret default missing reasons
expect_s3_class(interlacer::na_channel(resource$na_default), "factor")
expect_true(resource$na_default[[3]] == interlacer::na("OMITTED"))

# Interpret cfct_chr
expect_s3_class(resource$cfct_chr, "interlacer_cfactor")
expect_true(resource$cfct_chr[[3]] == "APPLE")
expect_true(interlacer::as.codes(resource$cfct_chr[[3]]) == "a")

# Interpret cfct_int
expect_s3_class(resource$cfct_int, "interlacer_cfactor")
expect_true(resource$cfct_int[[3]] == "APPLE")
expect_true(interlacer::as.codes(resource$cfct_int[[3]]) == 10)
})