add interlacer support via read_interlaced_resource() #213

Open
wants to merge 3 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
cli,
dplyr,
httr,
interlacer (>= 0.3.2),
jsonlite,
purrr,
readr (>= 2.1.0),
1 change: 1 addition & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ export(create_package)
export(create_schema)
export(get_schema)
export(problems)
export(read_interlaced_resource)
export(read_package)
export(read_resource)
export(remove_resource)
1 change: 1 addition & 0 deletions R/frictionless-package.R
@@ -2,4 +2,5 @@
"_PACKAGE"

#' @import rlang

NULL
259 changes: 190 additions & 69 deletions R/read_resource.R
@@ -16,6 +16,9 @@
#' @param col_select Character vector of the columns to include in the result,
#' in the order provided.
#' Selecting columns can improve read speed.
#' @param interlaced Logical value indicating whether columns should be read
#' as interlaced columns using the interlacer package.
#' @param ... Arguments passed to `read_resource()`.
#' @return [tibble()] data frame with the Data Resource's tabular data.
#' If there are parsing problems, a warning will alert you.
#' You can retrieve the full details by calling [problems()] on your data
@@ -198,7 +201,8 @@
#'
#' # Read data from the resource "deployments" with column selection
#' read_resource(package, "deployments", col_select = c("latitude", "longitude"))
read_resource <- function(package, resource_name, col_select = NULL) {
read_resource <- function(package, resource_name, col_select = NULL,
interlaced = FALSE) {
# Get resource, includes check_package()
resource <- get_resource(package, resource_name)

@@ -261,77 +265,27 @@ read_resource <- function(package, resource_name, col_select = NULL) {
)

# Create col_types: list(<collector_character>, <collector_logical>, ...)
col_types <- purrr::map(fields, function(x) {
type <- x$type %||% NA_character_
enum <- x$constraints$enum
group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
format <- x$format %||% "default" # Undefined => default

# Assign types and formats
col_type <- switch(type,
"string" = if (length(enum) > 0) {
readr::col_factor(levels = enum)
} else {
readr::col_character()
},
"number" = if (length(enum) > 0) {
readr::col_factor(levels = as.character(enum))
} else if (group_char) {
readr::col_number() # Supports grouping_mark
} else if (bare_number) {
readr::col_double() # Allows NaN, INF, -INF
} else {
readr::col_number() # Strips non-num. chars, uses default grouping_mark
},
"integer" = if (length(enum) > 0) {
readr::col_factor(levels = as.character(enum))
} else if (bare_number) {
readr::col_double() # Not col_integer() to avoid big integers issues
} else {
readr::col_number() # Strips non-numeric chars
},
"boolean" = readr::col_logical(),
"object" = readr::col_character(),
"array" = readr::col_character(),
"date" = readr::col_date(format = switch(format,
"default" = "%Y-%m-%d", # ISO
"any" = "%AD", # YMD
"%x" = "%m/%d/%y", # Python strptime for %x
format # Default
)),
"time" = readr::col_time(format = switch(format,
"default" = "%AT", # H(MS)
"any" = "%AT", # H(MS)
"%X" = "%H:%M:%S", # HMS
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"datetime" = readr::col_datetime(format = switch(format,
"default" = "", # ISO (lenient)
"any" = "", # ISO (lenient)
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"year" = readr::col_date(format = "%Y"),
"yearmonth" = readr::col_date(format = "%Y-%m"),
"duration" = readr::col_character(),
"geopoint" = readr::col_character(),
"geojson" = readr::col_character(),
"any" = readr::col_character()
)
# col_type will be NULL when type is undefined (NA_character_) or an
# unrecognized value (e.g. "datum", but will be blocked by check_schema()).
# Set those to col_guess().
col_type <- col_type %||% readr::col_guess()
col_type
})

# Assign names: list("name1" = <collector_character>, "name2" = ...)
names(col_types) <- field_names
col_types <- interlacer::as.x_col_spec(
purrr::map(set_names(fields, field_names), field_to_x_col)
)

# Select CSV dialect, see https://specs.frictionlessdata.io/csv-dialect/
# Note that dialect can be NULL
dialect <- read_descriptor(resource$dialect, package$directory, safe = TRUE)

na_col <- missing_values_to_na_col(
schema$missingValues %||% "",
.name = "default"
)

if (interlaced) {
read_fn <- interlacer::read_interlaced_delim
na <- na_col
} else {
read_fn <- readr::read_delim
na <- na_col$chr_values
}

# Read data directly
if (resource$read_from == "df") {
df <- dplyr::as_tibble(resource$data)
@@ -344,7 +298,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
} else if (resource$read_from == "path" || resource$read_from == "url") {
dataframes <- list()
for (i in seq_along(paths)) {
data <- readr::read_delim(
data <- read_fn(
file = paths[i],
delim = dialect$delimiter %||% ",",
quote = dialect$quoteChar %||% "\"",
@@ -365,7 +319,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
# a column, see https://rlang.r-lib.org/reference/topic-data-mask.html
col_select = {{col_select}},
locale = locale,
na = schema$missingValues %||% "",
na = na,
comment = dialect$commentChar %||% "",
trim_ws = dialect$skipInitialSpace %||% FALSE,
# Skip header row when present
@@ -380,3 +334,170 @@ read_resource <- function(package, resource_name, col_select = NULL) {

return(df)
}

#' @rdname read_resource
#' @export
read_interlaced_resource <- function(...) {
read_resource(..., interlaced = TRUE)
}
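# Illustration, not part of this diff: read_interlaced_resource() is a thin
# wrapper, so the two calls below are equivalent. A sketch that assumes this
# branch is loaded and the working directory is the package root, so the
# "interlaced" test resource added in tests/testthat/data/types.json resolves.
example_package <- read_package("tests/testthat/data/types.json")
read_interlaced_resource(example_package, "interlaced")
read_resource(example_package, "interlaced", interlaced = TRUE)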

field_to_x_col <- function(x) {
interlacer::x_col(
field_to_v_col(x),
missing_values_to_na_col(x$missingValues, .name = x$name)
)
}
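# Illustration, not part of this diff: a field descriptor with field-level
# missingValues maps to a pair of collectors, one per channel.
field_to_x_col(list(name = "na_int", type = "number", missingValues = c("-99", "-98")))
# -> interlacer::x_col(readr::col_double(), interlacer::na_col_integer(-99, -98))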

field_to_v_col <- function(x) {
type <- x$type %||% NA_character_
categories <- x$categories
categoriesOrdered <- x$categoriesOrdered %||% FALSE
group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
format <- x$format %||% "default" # Undefined => default

# Assign types and formats
col_type <- switch(type,
"string" = if (!is.null(categories)) {
categories_to_v_col(categories, categoriesOrdered, .name = x$name)
} else {
readr::col_character()
},
"number" = if (group_char) {
readr::col_number() # Supports grouping_mark
} else if (bare_number) {
readr::col_double() # Allows NaN, INF, -INF
} else {
readr::col_number() # Strips non-num. chars, uses default grouping_mark
},
"integer" = if (!is.null(categories)) {
categories_to_v_col(categories, categoriesOrdered, .name = x$name)
} else if (bare_number) {
readr::col_double() # Not col_integer() to avoid big integers issues
} else {
readr::col_number() # Strips non-numeric chars
},
"boolean" = readr::col_logical(),
"object" = readr::col_character(),
"array" = readr::col_character(),
"date" = readr::col_date(format = switch(format,
"default" = "%Y-%m-%d", # ISO
"any" = "%AD", # YMD
"%x" = "%m/%d/%y", # Python strptime for %x
format # Default
)),
"time" = readr::col_time(format = switch(format,
"default" = "%AT", # H(MS)
"any" = "%AT", # H(MS)
"%X" = "%H:%M:%S", # HMS
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"datetime" = readr::col_datetime(format = switch(format,
"default" = "", # ISO (lenient)
"any" = "", # ISO (lenient)
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"year" = readr::col_date(format = "%Y"),
"yearmonth" = readr::col_date(format = "%Y-%m"),
"duration" = readr::col_character(),
"geopoint" = readr::col_character(),
"geojson" = readr::col_character(),
"any" = readr::col_character()
)
# col_type will be NULL when type is undefined (NA_character_) or an
# unrecognized value (e.g. "datum", but will be blocked by check_schema()).
# Set those to col_guess().
col_type <- col_type %||% readr::col_guess()
col_type
}
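# Illustration, not part of this diff: field_to_v_col() turns a Table Schema
# field descriptor (a named list) into a readr collector for the value channel.
field_to_v_col(list(name = "mass", type = "number", groupChar = ","))
# -> readr::col_number(), so "1,234" is read as 1234
field_to_v_col(list(name = "eventDate", type = "date", format = "%x"))
# -> readr::col_date(format = "%m/%d/%y")
field_to_v_col(list(name = "note"))
# -> readr::col_guess(), because the type is undefined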

categories_to_v_col <- function(categories, categoriesOrdered, .name) {
if (is.character(categories) || is.numeric(categories)) {
readr::col_factor(
levels = as.character(categories),
ordered = categoriesOrdered
)
} else if (is.list(categories)) {
have_value_prop <- purrr::map_lgl(categories, \(v) !is.null(v$value))

if (!all(have_value_prop)) {
cli::cli_abort(
"categories for {.name} is missing a `value` property"
)
}

values <- purrr::map_vec(categories, \(v) v$value)

labels <- purrr::map_chr(
seq_along(categories),
\(i) categories[[i]]$label %||% categories[[i]]$value
)

if (all(values == labels)) {
readr::col_factor(
levels = as.character(values),
ordered = categoriesOrdered
)
} else {
interlacer::v_col_cfactor(
codes = set_names(values, labels),
ordered = categoriesOrdered
)
}
}
}
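# Illustration, not part of this diff: bare categories become a plain factor
# collector; value/label pairs become an interlacer coded factor, so codes in
# the file are decoded to labels on read.
categories_to_v_col(c("red", "green", "blue"), FALSE, .name = "color")
# -> readr::col_factor(levels = c("red", "green", "blue"))
categories_to_v_col(
list(list(value = 10, label = "APPLE"), list(value = 20, label = "BANANA")),
FALSE, .name = "fruit"
)
# -> interlacer::v_col_cfactor(codes = c(APPLE = 10, BANANA = 20))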

missing_values_to_na_col <- function(missing_values, .name) {
if (is.null(missing_values)) {
interlacer::na_col_default()
} else if (is_numeric_coercible(missing_values)) {
inject(interlacer::na_col_integer(!!!as.integer(missing_values)))
} else if (is.character(missing_values)) {
inject(interlacer::na_col_factor(!!!missing_values))
} else if (is.list(missing_values)) {
if (length(missing_values) == 0) {
interlacer::na_col_none()
} else {
have_value_prop <- purrr::map_lgl(missing_values, \(v) !is.null(v$value))

if (!all(have_value_prop)) {
cli::cli_abort(
"missingValues for {.name} is missing a `value` property"
)
}

values <- purrr::map_chr(missing_values, \(v) v$value)

if (is_numeric_coercible(values)) {
values <- as.numeric(values)
}

labels <- purrr::map_chr(
seq_along(missing_values),
\(i) missing_values[[i]]$label %||% missing_values[[i]]$value
)

if (all(values == labels)) {
readr::col_factor(levels = values)
} else {
inject(interlacer::na_col_cfactor(!!!set_names(values, labels)))
}
}
} else {
cli::cli_abort(
"Cannot process missingValues for {.name}; expected list of strings or objects"
)
}
}
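# Illustration, not part of this diff: each accepted shape of missingValues
# maps to a different NA-channel spec.
missing_values_to_na_col(NULL, .name = "x") # -> interlacer::na_col_default()
missing_values_to_na_col(list(), .name = "x") # -> interlacer::na_col_none()
missing_values_to_na_col(c("-99", "-98"), .name = "x") # -> interlacer::na_col_integer(-99, -98)
missing_values_to_na_col(c("OMITTED", "REFUSED"), .name = "x") # -> interlacer::na_col_factor("OMITTED", "REFUSED")
missing_values_to_na_col(
list(list(value = "-99", label = "OMITTED"), list(value = "-98", label = "REFUSED")),
.name = "x"
) # -> interlacer::na_col_cfactor(OMITTED = -99, REFUSED = -98)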

is_numeric_coercible <- function(x) {
if (is.numeric(x)) {
TRUE
} else if (is.character(x)) {
n_na <- sum(is.na(x))
conv <- suppressWarnings(as.numeric(x))
n_na == sum(is.na(conv))
} else {
FALSE
}
}
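# Illustration, not part of this diff:
is_numeric_coercible(c("-99", "-98")) # TRUE: every value parses as a number
is_numeric_coercible(c("OMITTED", "-99")) # FALSE: "OMITTED" would coerce to NA
is_numeric_coercible(list("-99")) # FALSE: neither numeric nor character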
10 changes: 9 additions & 1 deletion man/read_resource.Rd


5 changes: 5 additions & 0 deletions tests/testthat/data/type_fct.csv
@@ -0,0 +1,5 @@
fct_chr,fct_int,ord_chr,ord_int
red,10,red,10
red,20,red,20
green,10,green,10
blue,10,blue,10
6 changes: 6 additions & 0 deletions tests/testthat/data/type_interlaced.csv
@@ -0,0 +1,6 @@
na_fct,na_int,na_cfct,na_none,na_default,cfct_chr,cfct_int
1.1,1.1,1.1,1,1,a,10
2.2,2.2,2.2,2,2,b,20
OMITTED,-99,-99,3,OMITTED,a,10
REFUSED,-98,-98,3,3,b,20
5.5,5.5,5.5,4,4,a,10
78 changes: 58 additions & 20 deletions tests/testthat/data/types.json
@@ -13,12 +13,7 @@
{
"name": "str_factor",
"type": "string",
"constraints": {
"enum": [
"foo",
"bar"
]
}
"categories": ["foo", "bar"]
}
]
}
@@ -40,13 +35,7 @@
{
"name": "num_factor",
"type": "number",
"constraints": {
"enum": [
3.1,
3.2,
3.3
]
}
"notes": "This is deprecated; factors should only be int and string"
},
{
"name": "num_nan",
@@ -94,13 +83,7 @@
{
"name": "int_factor",
"type": "integer",
"constraints": {
"enum": [
3,
4,
-1
]
}
"categories": [3, 4, -1]
},
{
"name": "int_ws",
@@ -294,6 +277,61 @@
}
]
}
},
{
"name": "interlaced",
"path": "type_interlaced.csv",
"profile": "tabular-data-resource",
"schema": {
"missingValues": ["OMITTED"],
"fields": [
{
"name": "na_fct",
"type": "number",
"missingValues": ["OMITTED", "REFUSED"]
},
{
"name": "na_int",
"type": "number",
"missingValues": ["-99", "-98"]
},
{
"name": "na_cfct",
"type": "number",
"missingValues": [
{ "value": "-99", "label": "OMITTED" },
{ "value": "-98", "label": "REFUSED" }
]
},
{
"name": "na_none",
"type": "integer",
"missingValues": []
},
{
"name": "na_default",
"type": "number"
},
{
"name": "cfct_chr",
"type": "string",
"categories": [
{ "value": "a", "label": "APPLE" },
{ "value": "b", "label": "BANANA" }
],
"missingValues": []
},
{
"name": "cfct_int",
"type": "integer",
"categories": [
{ "value": 10, "label": "APPLE" },
{ "value": 20, "label": "BANANA" }
],
"missingValues": []
}
]
}
}
]
}
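As a sketch of the expected behaviour (illustrative only, not part of the diff; assumes this branch is loaded and is run from the package root), reading the new "interlaced" resource routes the schema-declared missing values into the NA channel while decoding coded factors:

package <- read_package("tests/testthat/data/types.json")
df <- read_interlaced_resource(package, "interlaced")
df$na_fct[[3]] == interlacer::na("OMITTED") # TRUE: "OMITTED" is a missing reason, not a value
df$na_int[[3]] == interlacer::na(-99) # TRUE: field-level missingValues "-99"/"-98"
df$cfct_chr[[1]] # APPLE: code "a" decoded to its label
interlacer::as.codes(df$cfct_chr[[1]]) # "a"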
52 changes: 42 additions & 10 deletions tests/testthat/test-read_resource.R
@@ -582,10 +582,10 @@ test_that("read_resource() handles strings", {
resource <- read_resource(p, "string")
expect_type(resource$str, "character")

# Use factor when enum is present
enum <- p$resources[[1]]$schema$fields[[2]]$constraints$enum
# Use factor when categories are present
categories <- p$resources[[1]]$schema$fields[[2]]$categories
expect_s3_class(resource$str_factor, "factor")
expect_identical(levels(resource$str_factor), enum)
expect_identical(levels(resource$str_factor), categories)
})

test_that("read_resource() handles numbers", {
@@ -598,10 +598,8 @@ test_that("read_resource() handles numbers", {
expect_type(resource$num_neg, "double")
expect_true(all(resource$num_neg == -3))

# Use factor when enum is present
enum <- p$resources[[2]]$schema$fields[[3]]$constraints$enum
expect_s3_class(resource$num_factor, "factor")
expect_identical(levels(resource$num_factor), as.character(enum))
# Use factor when enum is present (deprecated)
# TODO: remove num_factor from test file

# NaN, INF, -INF are supported, case-insensitive
expect_type(resource$num_nan, "double")
@@ -631,10 +629,10 @@ test_that("read_resource() handles integers (as doubles)", {
expect_type(resource$int_neg, "double")
expect_true(all(resource$int_neg == -3))

# Use factor when enum is present
enum <- p$resources[[3]]$schema$fields[[3]]$constraints$enum
# Use factor when categories are present
categories <- p$resources[[3]]$schema$fields[[3]]$categories
expect_s3_class(resource$int_factor, "factor")
expect_identical(levels(resource$int_factor), as.character(enum))
expect_identical(levels(resource$int_factor), as.character(categories))

# bareNumber = false allows whitespace and non-numeric characters
expect_type(resource$int_ws, "double")
@@ -738,3 +736,37 @@ test_that("read_resource() handles other types", {
# Guess undefined types, unknown types are blocked by check_schema()
expect_type(resource$no_type, "logical")
})

test_that("read_resource() handles interlaced types", {
p <- read_package(test_path("data/types.json"))
resource <- read_interlaced_resource(p, "interlaced")

# Interpret fct missing reasons
expect_s3_class(interlacer::na_channel(resource$na_fct), "factor")
expect_true(resource$na_fct[[3]] == interlacer::na("OMITTED"))

# Interpret int missing reasons
expect_type(interlacer::na_channel(resource$na_int), "integer")
expect_true(resource$na_int[[3]] == interlacer::na(-99))

# Interpret cfct missing reasons
expect_s3_class(interlacer::na_channel(resource$na_cfct), "interlacer_cfactor")
expect_true(resource$na_cfct[[3]] == interlacer::na("OMITTED"))

# Interpret none missing reasons
expect_s3_class(resource$na_none, NA)

# Interpret default missing reasons
expect_s3_class(interlacer::na_channel(resource$na_default), "factor")
expect_true(resource$na_default[[3]] == interlacer::na("OMITTED"))

# Interpret cfct_chr
expect_s3_class(resource$cfct_chr, "interlacer_cfactor")
expect_true(resource$cfct_chr[[3]] == "APPLE")
expect_true(interlacer::as.codes(resource$cfct_chr[[3]]) == "a")

# Interpret cfct_int
expect_s3_class(resource$cfct_int, "interlacer_cfactor")
expect_true(resource$cfct_int[[3]] == "APPLE")
expect_true(interlacer::as.codes(resource$cfct_int[[3]]) == 10)
})