Skip to content

Commit fa1b356

Browse files
Infer schemas and coerce data for table cells (#346)
1 parent 3afb7c7 commit fa1b356

File tree

2 files changed

+768
-86
lines changed

2 files changed

+768
-86
lines changed

src/table.js

Lines changed: 196 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import {reverse} from "d3-array";
1+
import {greatest, reverse} from "d3-array";
22
import {FileAttachment} from "./fileAttachment.js";
33
import {isArqueroTable} from "./arquero.js";
44
import {isArrowTable, loadArrow} from "./arrow.js";
@@ -66,13 +66,20 @@ function objectHasEnumerableKeys(value) {
6666
}
6767

6868
function isQueryResultSetSchema(schemas) {
69-
return (Array.isArray(schemas) && schemas.every((s) => s && typeof s.name === "string"));
69+
return (
70+
Array.isArray(schemas) &&
71+
schemas.every(isColumnSchema)
72+
);
7073
}
7174

7275
// Returns true if the given value is an array whose entries are all strings
// (i.e., a list of column names). An empty array qualifies; anything that is
// not an array does not.
function isQueryResultSetColumns(columns) {
  if (!Array.isArray(columns)) return false;
  const hasNonString = columns.some((name) => typeof name !== "string");
  return !hasNonString;
}
7578

79+
// Tests whether the given value looks like a column schema entry: a truthy
// value whose `name` and `type` properties are both strings. A falsy input is
// returned as-is (so the result is falsy, but not necessarily `false`).
function isColumnSchema(schema) {
  if (!schema) return schema;
  if (typeof schema.name !== "string") return false;
  return typeof schema.type === "string";
}
82+
7683
// Returns true if the value represents an array of primitives (i.e., a
7784
// single-column table). This should only be passed values for which
7885
// isDataArray returns true.
@@ -191,15 +198,17 @@ function sourceCache(loadSource) {
191198
const loadTableDataSource = sourceCache(async (source, name) => {
192199
if (source instanceof FileAttachment) {
193200
switch (source.mimeType) {
194-
case "text/csv": return source.csv({typed: true});
195-
case "text/tab-separated-values": return source.tsv({typed: true});
201+
case "text/csv": return source.csv();
202+
case "text/tab-separated-values": return source.tsv();
196203
case "application/json": return source.json();
197204
case "application/x-sqlite3": return source.sqlite();
198205
}
199206
if (/\.(arrow|parquet)$/i.test(source.name)) return loadDuckDBClient(source, name);
200207
throw new Error(`unsupported file type: ${source.mimeType}`);
201208
}
202209
if (isArrowTable(source) || isArqueroTable(source)) return loadDuckDBClient(source, name);
210+
if (isDataArray(source) && arrayIsPrimitive(source))
211+
return Array.from(source, (value) => ({value}));
203212
return source;
204213
});
205214

@@ -542,15 +551,84 @@ export function getTypeValidator(colType) {
542551
}
543552
}
544553

554+
// Accepts dates in the form of ISOString and LocaleDateString, with or without time
const DATE_TEST = /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/;

/**
 * Coerces a single cell value to the given column type.
 *
 * - "string": null/undefined and strings pass through; anything else goes
 *   through String().
 * - "boolean": strings are trimmed and compared case-insensitively against
 *   "true"/"false" (any other string yields null); null/undefined and
 *   booleans pass through; anything else goes through Boolean().
 * - "bigint": null/undefined and bigints pass through; values that convert
 *   to an integer number become a BigInt; everything else (including blank
 *   strings) yields undefined.
 * - "integer"/"number": numbers pass through; null/undefined and blank
 *   strings yield NaN; anything else goes through Number().
 * - "date": null/undefined and Dates pass through; numbers are passed to the
 *   Date constructor; blank strings yield null; other values are stringified,
 *   trimmed, and parsed only if they match DATE_TEST (otherwise the result is
 *   an invalid Date).
 * - "array", "object", "buffer", "other": returned unchanged.
 *
 * @param {*} value - The cell value to coerce.
 * @param {string} type - The target column type.
 * @returns {*} The coerced value.
 * @throws {Error} If the type is not one of the types listed above.
 */
export function coerceToType(value, type) {
  switch (type) {
    case "string":
      return typeof value === "string" || value == null ? value : String(value);
    case "boolean":
      if (typeof value === "string") {
        const trimValue = value.trim().toLowerCase();
        return trimValue === "true"
          ? true
          : trimValue === "false"
          ? false
          : null;
      }
      return typeof value === "boolean" || value == null
        ? value
        : Boolean(value);
    case "bigint": {
      if (typeof value === "bigint" || value == null) return value;
      // Number("") is 0, so blank strings must be rejected explicitly.
      if (typeof value === "string" && !value.trim()) return undefined;
      const number = +value;
      if (!Number.isInteger(number)) return undefined;
      try {
        // Prefer converting the original value: for integer literals this is
        // exact even beyond Number.MAX_SAFE_INTEGER.
        return BigInt(value); // eslint-disable-line no-undef
      } catch (error) {
        if (!(error instanceof SyntaxError)) throw error;
        // Strings such as "1e3" or "1.0" are integers as numbers, but are not
        // valid BigInt literals; convert the (already validated) number.
        return BigInt(number); // eslint-disable-line no-undef
      }
    }
    case "integer": // not a target type for coercion, but can be inferred
    case "number": {
      return typeof value === "number"
        ? value
        : value == null || (typeof value === "string" && !value.trim())
        ? NaN
        : Number(value);
    }
    case "date": {
      if (value instanceof Date || value == null) return value;
      if (typeof value === "number") return new Date(value);
      const trimValue = String(value).trim();
      if (typeof value === "string" && !trimValue) return null;
      // Only hand recognized formats to the Date parser; anything else
      // becomes an invalid Date.
      return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN);
    }
    case "array":
    case "object":
    case "buffer":
    case "other":
      return value;
    default:
      throw new Error(`Unable to coerce to type: ${type}`);
  }
}
603+
545604
// This function applies table cell operations to an in-memory table (array of
546605
// objects); it should be equivalent to the corresponding SQL query. TODO Use
547606
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
548607
// function to do table operations on in-memory data?
549608
export function __table(source, operations) {
550609
const input = source;
551610
let {schema, columns} = source;
552-
let primitive = arrayIsPrimitive(source);
553-
if (primitive) source = Array.from(source, (value) => ({value}));
611+
let inferredSchema = false;
612+
if (!isQueryResultSetSchema(schema)) {
613+
schema = inferSchema(source, columns);
614+
inferredSchema = true;
615+
}
616+
// Combine column types from schema with user-selected types in operations
617+
const types = new Map(schema.map(({name, type}) => [name, type]));
618+
if (operations.type) {
619+
for (const {name, type} of operations.type) {
620+
types.set(name, type);
621+
// update schema with user-selected type
622+
if (schema === input.schema) schema = schema.slice(); // copy on write
623+
const colIndex = schema.findIndex((col) => col.name === name);
624+
if (colIndex > -1) schema[colIndex] = {...schema[colIndex], type};
625+
}
626+
source = source.map(d => coerceRow(d, types, schema));
627+
} else if (inferredSchema) {
628+
// Coerce data according to new schema, unless that happened due to
629+
// operations.type, above.
630+
source = source.map(d => coerceRow(d, types, schema));
631+
}
554632
for (const {type, operands} of operations.filter) {
555633
const [{value: column}] = operands;
556634
const values = operands.slice(1).map(({value}) => value);
@@ -663,7 +741,7 @@ export function __table(source, operations) {
663741
Object.fromEntries(operations.select.columns.map((c) => [c, d[c]]))
664742
);
665743
}
666-
if (!primitive && operations.names) {
744+
if (operations.names) {
667745
const overridesByName = new Map(operations.names.map((n) => [n.column, n]));
668746
if (schema) {
669747
schema = schema.map((s) => {
@@ -684,10 +762,120 @@ export function __table(source, operations) {
684762
}))
685763
);
686764
}
687-
if (primitive) source = source.map((d) => d.value);
688765
if (source !== input) {
689766
if (schema) source.schema = schema;
690767
if (columns) source.columns = columns;
691768
}
692769
return source;
693770
}
771+
772+
/**
 * Builds a new row containing exactly the columns listed in the schema, with
 * each value coerced to that column's type.
 *
 * @param {Object|null|undefined} object - The source row. A null or undefined
 *   row is treated like a row with no fields, so every column is coerced from
 *   undefined instead of throwing.
 * @param {Map<string, string>} types - Maps column name to target type; the
 *   special type "raw" copies the value without coercion.
 * @param {Iterable<{name: string}>} schema - The columns to include, in order.
 * @returns {Object} The coerced row.
 */
function coerceRow(object, types, schema) {
  const entries = [];
  for (const {name} of schema) {
    const type = types.get(name);
    const value = object?.[name];
    entries.push([name, type === "raw" ? value : coerceToType(value, type)]);
  }
  // Object.fromEntries defines own data properties, so a column that happens
  // to be named "__proto__" is stored as a regular key.
  return Object.fromEntries(entries);
}
781+
782+
// Creates a fresh tally object for a single column: one zeroed counter per
// type that inferSchema can observe, plus `defined`, which inferSchema
// increments for each sampled value that is neither null/undefined nor a
// blank string.
function createTypeCount() {
  const counters = [
    "boolean",
    "integer",
    "number",
    "date",
    "string",
    "array",
    "object",
    "bigint",
    "buffer",
    "defined"
  ];
  const tally = {};
  for (const counter of counters) tally[counter] = 0;
  return tally;
}
796+
797+
// The candidate types considered during schema inference.
//
// Caution: the order below matters! 🌶️ The first one that passes the ≥90% test
// should be the one that we choose, and therefore these types should be listed
// from most specific to least specific.
//
// Note: "other" and "string" are intentionally omitted; see below!
const types = ["boolean", "integer", "number", "date", "bigint", "array", "object", "buffer"];
811+
812+
// Collects every key that appears on any row, so that *all* columns present in
// the array of objects can be shown. Only own, enumerable, string-keyed
// properties count; keys are unique and listed in the order they are first
// encountered. Falsy rows (such as null or undefined) are skipped.
function getAllKeys(rows) {
  const seen = new Set();
  for (const row of rows) {
    if (!row) continue;
    Object.keys(row).forEach((key) => seen.add(key));
  }
  return [...seen];
}
830+
831+
/**
 * Infers a column schema from a sample of the data.
 *
 * Only the first 100 rows are inspected. For each column, every sampled value
 * that is neither null/undefined nor a blank string is tallied by type;
 * strings are additionally tested to see whether they look like booleans,
 * numbers, or dates.
 *
 * @param {Array} source - The rows to inspect; null or undefined rows are
 *   skipped.
 * @param {Iterable<string>} [columns] - The column names to infer; defaults
 *   to every key found by getAllKeys(source).
 * @returns {Array<{name: string, type: string, inferred: string}>} One entry
 *   per column, where `type` and `inferred` both hold the inferred type.
 */
export function inferSchema(source, columns = getAllKeys(source)) {
  const schema = [];
  const sampleSize = 100;
  const sample = source.slice(0, sampleSize);
  for (const col of columns) {
    const colCount = createTypeCount();
    for (const d of sample) {
      // Rows may be null or undefined (getAllKeys tolerates these as well), so
      // treat them like rows in which this column is missing.
      let value = d?.[col];
      if (value == null) continue;
      const type = typeof value;
      if (type !== "string") {
        ++colCount.defined;
        if (Array.isArray(value)) ++colCount.array;
        else if (value instanceof Date) ++colCount.date;
        else if (value instanceof ArrayBuffer) ++colCount.buffer;
        else if (type === "number") {
          // Integers are counted as both "number" and "integer".
          ++colCount.number;
          if (Number.isInteger(value)) ++colCount.integer;
        }
        // bigint, boolean, or object
        else if (type in colCount) ++colCount[type];
      } else {
        value = value.trim();
        // Blank strings are treated as missing values.
        if (!value) continue;
        ++colCount.defined;
        ++colCount.string;
        if (/^(true|false)$/i.test(value)) {
          ++colCount.boolean;
        } else if (!Number.isNaN(Number(value))) {
          ++colCount.number;
          if (Number.isInteger(+value)) ++colCount.integer;
        } else if (DATE_TEST.test(value)) ++colCount.date;
      }
    }
    // Choose the non-string, non-other type with the greatest count that is
    // also ≥90%; or if no such type meets that criterion, fallback to string if
    // ≥90%; and lastly fallback to other.
    const minCount = Math.max(1, colCount.defined * 0.9);
    const type =
      greatest(types, (type) =>
        colCount[type] >= minCount ? colCount[type] : NaN
      ) ?? (colCount.string >= minCount ? "string" : "other");
    schema.push({
      name: col,
      type: type,
      inferred: type
    });
  }
  return schema;
}

0 commit comments

Comments
 (0)