Skip to content

Commit 63bce4e

Browse files
Annie Zhang and mkfreeman authored
Derived columns (#367)
* Basic handling of column derivations
* Derive value before running __table
* Add applyTypes method
* Return the full schema on the source
* move derivations into __table
* cleanup; add comments
* use hidden flag for deselected columns; rm fullSchema
* add test for derive; clean up other tests
* allow derived columns to reference previously derived columns
* go back to using .fullSchema
* refine comment
* fix derivedSource; refine fullSchema; add fullSchema to tests
* handle usage of renamed columns in derived formulas
* catch and return runtime errors
* use array instead of map for columnErrors; update unit tests

---------

Co-authored-by: mkfreeman <[email protected]>
1 parent 9f01d7a commit 63bce4e

File tree

2 files changed

+162
-22
lines changed

2 files changed

+162
-22
lines changed

src/table.js

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -627,14 +627,13 @@ export function getSchema(source) {
627627
return {schema, inferred: false};
628628
}
629629

630-
// This function applies table cell operations to an in-memory table (array of
631-
// objects); it should be equivalent to the corresponding SQL query. TODO Use
632-
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
633-
// function to do table operations on in-memory data?
634-
export function __table(source, operations) {
630+
// This function infers a schema from the source data, if one doesn't already
631+
// exist, and merges type assertions into that schema. If the schema was
632+
// inferred or if there are type assertions, it then coerces the rows in the
633+
// source data to the types specified in the schema.
634+
function applyTypes(source, operations) {
635635
const input = source;
636636
let {schema, inferred} = getSchema(source);
637-
// Combine column types from schema with user-selected types in operations
638637
const types = new Map(schema.map(({name, type}) => [name, type]));
639638
if (operations.types) {
640639
for (const {name, type} of operations.types) {
@@ -650,6 +649,66 @@ export function __table(source, operations) {
650649
// operations.types, above.
651650
source = source.map(d => coerceRow(d, types, schema));
652651
}
652+
return {source, schema};
653+
}
654+
655+
/**
 * Returns the rows of `source` with their column keys renamed according to
 * `operations.names`.
 *
 * Each entry of `operations.names` is read as `{column, name}`: `column` is the
 * existing key and `name` is its replacement. Keys with no matching entry, or
 * whose entry has a nullish `name`, are kept unchanged. Values are copied over
 * as-is and each renamed row keeps the key order of the original row.
 *
 * The input rows are never mutated. Because the result is built with
 * `Array.prototype.map`, any extra properties attached to the `source` array
 * itself are not carried over to the returned array.
 *
 * @param {Array<Object>} source - Rows (plain objects) to rename.
 * @param {{names?: Array<{column: string, name: string}>}} operations - Table
 *   operations; only `names` is read here.
 * @returns {Array<Object>} `source` itself when `operations.names` is falsy;
 *   otherwise a new array of new row objects.
 */
function applyNames(source, operations) {
  // No rename requested: hand back the same array so callers can detect,
  // by identity, that nothing changed.
  if (!operations.names) return source;
  // Index the overrides by original column name for O(1) lookup per key.
  const overridesByName = new Map(operations.names.map((n) => [n.column, n]));
  return source.map((d) =>
    Object.fromEntries(
      Object.keys(d).map((k) => {
        const override = overridesByName.get(k);
        // Fall back to the original key when there is no usable override.
        return [override?.name ?? k, d[k]];
      })
    )
  );
}
665+
666+
// This function applies table cell operations to an in-memory table (array of
667+
// objects); it should be equivalent to the corresponding SQL query. TODO Use
668+
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
669+
// function to do table operations on in-memory data?
670+
export function __table(source, operations) {
671+
const errors = new Map();
672+
const input = source;
673+
const typed = applyTypes(source, operations);
674+
source = typed.source;
675+
let schema = typed.schema;
676+
if (operations.derive) {
677+
// Derived columns may depend on coerced values from the original data source,
678+
// so we must evaluate derivations after the initial inference and coercion
679+
// step.
680+
const derivedSource = [];
681+
operations.derive.map(({name, value}) => {
682+
let columnErrors = [];
683+
// Derived column formulas may reference renamed columns, so we must
684+
// compute derivations on the renamed source. However, we don't modify the
685+
// source itself with renamed names until after the other operations are
686+
// applied, because operations like filter and sort reference original
687+
// column names.
688+
// TODO Allow derived columns to reference other derived columns.
689+
applyNames(source, operations).map((row, index, rows) => {
690+
let resolved;
691+
try {
692+
resolved = value(row, index, rows);
693+
} catch (error) {
694+
columnErrors.push({index, error});
695+
resolved = undefined;
696+
}
697+
if (derivedSource[index]) {
698+
derivedSource[index] = {...derivedSource[index], [name]: resolved};
699+
} else {
700+
derivedSource.push({[name]: resolved});
701+
}
702+
});
703+
if (columnErrors.length) errors.set(name, columnErrors);
704+
});
705+
// Since derived columns are untyped by default, we do a pass of type
706+
// inference and coercion after computing the derived values.
707+
const typedDerived = applyTypes(derivedSource, operations);
708+
// Merge derived source and schema with the source dataset.
709+
source = source.map((row, i) => ({...row, ...typedDerived.source[i]}));
710+
schema = [...schema, ...typedDerived.schema];
711+
}
653712
for (const {type, operands} of operations.filter) {
654713
const [{value: column}] = operands;
655714
const values = operands.slice(1).map(({value}) => value);
@@ -750,6 +809,8 @@ export function __table(source, operations) {
750809
if (from > 0 || to < Infinity) {
751810
source = source.slice(Math.max(0, from), Math.max(0, to));
752811
}
812+
// Preserve the schema for all columns.
813+
let fullSchema = schema.slice();
753814
if (operations.select.columns) {
754815
if (schema) {
755816
const schemaByName = new Map(schema.map((s) => [s.name, s]));
@@ -767,16 +828,19 @@ export function __table(source, operations) {
767828
return ({...s, ...(override ? {name: override.name} : null)});
768829
});
769830
}
770-
source = source.map((d) =>
771-
Object.fromEntries(Object.keys(d).map((k) => {
772-
const override = overridesByName.get(k);
773-
return [override?.name ?? k, d[k]];
774-
}))
775-
);
831+
if (fullSchema) {
832+
fullSchema = fullSchema.map((s) => {
833+
const override = overridesByName.get(s.name);
834+
return ({...s, ...(override ? {name: override.name} : null)});
835+
});
836+
}
837+
source = applyNames(source, operations);
776838
}
777839
if (source !== input) {
778840
if (schema) source.schema = schema;
779841
}
842+
source.fullSchema = fullSchema;
843+
source.errors = errors;
780844
return source;
781845
}
782846

test/table-test.js

Lines changed: 86 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,8 @@ describe("__table", () => {
504504
};
505505
const expectedEmpty = [{}, {}, {}];
506506
expectedEmpty.schema = [];
507+
expectedEmpty.fullSchema = source.schema;
508+
expectedEmpty.errors = new Map();
507509
assert.deepStrictEqual(
508510
__table(source, operationsEmptyColumns),
509511
expectedEmpty
@@ -514,6 +516,8 @@ describe("__table", () => {
514516
};
515517
const expectedSelected = [{a: 1}, {a: 2}, {a: 3}];
516518
expectedSelected.schema = [{name: "a", type: "integer", inferred: "integer"}];
519+
expectedSelected.fullSchema = source.schema;
520+
expectedSelected.errors = new Map();
517521
assert.deepStrictEqual(
518522
__table(source, operationsSelectedColumns),
519523
expectedSelected
@@ -546,6 +550,8 @@ describe("__table", () => {
546550
};
547551
const expectedEq = [{a: 1, b: 2, c: 3}];
548552
expectedEq.schema = source.schema;
553+
expectedEq.fullSchema = source.schema;
554+
expectedEq.errors = new Map();
549555
assert.deepStrictEqual(__table(source, operationsEquals), expectedEq);
550556
const operationsComparison = {
551557
...EMPTY_TABLE_DATA.operations,
@@ -568,6 +574,8 @@ describe("__table", () => {
568574
};
569575
const expectedLtGt = [{a: 2, b: 4, c: 6}];
570576
expectedLtGt.schema = source.schema;
577+
expectedLtGt.fullSchema = source.schema;
578+
expectedLtGt.errors = new Map();
571579
assert.deepStrictEqual(__table(source, operationsComparison), expectedLtGt);
572580
});
573581

@@ -586,6 +594,8 @@ describe("__table", () => {
586594
};
587595
const expectedEq = [{a: 1, b: 2, c: 3}];
588596
expectedEq.schema = source.schema;
597+
expectedEq.fullSchema = source.schema;
598+
expectedEq.errors = new Map();
589599
assert.deepStrictEqual(__table(source, operationsEquals), expectedEq);
590600
const operationsComparison = {
591601
...EMPTY_TABLE_DATA.operations,
@@ -608,6 +618,8 @@ describe("__table", () => {
608618
};
609619
const expectedLteGte = [{a: 2, b: 4, c: 6}];
610620
expectedLteGte.schema = source.schema;
621+
expectedLteGte.fullSchema = source.schema;
622+
expectedLteGte.errors = new Map();
611623
assert.deepStrictEqual(
612624
__table(source, operationsComparison),
613625
expectedLteGte
@@ -634,6 +646,8 @@ describe("__table", () => {
634646
];
635647
const expected = [{a: new Date("2021-01-02")}];
636648
expected.schema = [{name: "a", type: "date", inferred: "date"}];
649+
expected.fullSchema = expected.schema;
650+
expected.errors = new Map();
637651
assert.deepStrictEqual(__table(source, operationsEquals), expected);
638652
});
639653

@@ -648,6 +662,8 @@ describe("__table", () => {
648662
{a: 1, b: 2, c: 3}
649663
];
650664
expectedDesc.schema = source.schema;
665+
expectedDesc.fullSchema = source.schema;
666+
expectedDesc.errors = new Map();
651667
assert.deepStrictEqual(__table(source, operationsDesc), expectedDesc);
652668
const operationsAsc = {
653669
...EMPTY_TABLE_DATA.operations,
@@ -659,6 +675,8 @@ describe("__table", () => {
659675
{a: 3, b: 6, c: 9}
660676
];
661677
expectedAsc.schema = source.schema;
678+
expectedAsc.fullSchema = source.schema;
679+
expectedAsc.errors = new Map();
662680
assert.deepStrictEqual(__table(source, operationsAsc), expectedAsc);
663681
const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}];
664682
const operationsMulti = {
@@ -676,6 +694,8 @@ describe("__table", () => {
676694
{a: 1, b: 2, c: 3}
677695
];
678696
expectedExtended.schema = source.schema;
697+
expectedExtended.fullSchema = source.schema;
698+
expectedExtended.errors = new Map();
679699
assert.deepStrictEqual(
680700
__table(sourceExtended, operationsMulti),
681701
expectedExtended
@@ -694,6 +714,8 @@ describe("__table", () => {
694714
{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: NaN}, {a: NaN}, {a: NaN}, {a: NaN}
695715
];
696716
expectedDesc.schema = [{name: "a", type: "number", inferred: "number"}];
717+
expectedDesc.fullSchema = expectedDesc.schema;
718+
expectedDesc.errors = new Map();
697719
assert.deepStrictEqual(
698720
__table(sourceWithMissing, operationsDesc),
699721
expectedDesc
@@ -706,6 +728,8 @@ describe("__table", () => {
706728
{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: NaN}, {a: NaN}, {a: NaN}, {a: NaN}
707729
];
708730
expectedAsc.schema = [{name: "a", type: "number", inferred: "number"}];
731+
expectedAsc.fullSchema = expectedAsc.schema;
732+
expectedAsc.errors = new Map();
709733
assert.deepStrictEqual(
710734
__table(sourceWithMissing, operationsAsc),
711735
expectedAsc
@@ -723,6 +747,8 @@ describe("__table", () => {
723747
{a: 1, b: 2, c: 3}
724748
];
725749
sorted.schema = source.schema;
750+
sorted.fullSchema = source.schema;
751+
sorted.errors = new Map();
726752
assert.deepStrictEqual(__table(source, operations), sorted);
727753
const originalOrder = [
728754
{a: 1, b: 2, c: 3},
@@ -743,13 +769,17 @@ describe("__table", () => {
743769
{a: 3, b: 6, c: 9}
744770
];
745771
expectedToNull.schema = source.schema;
772+
expectedToNull.fullSchema = source.schema;
773+
expectedToNull.errors = new Map();
746774
assert.deepStrictEqual(__table(source, operationsToNull), expectedToNull);
747775
const operationsFromNull = {
748776
...EMPTY_TABLE_DATA.operations,
749777
slice: {from: null, to: 1}
750778
};
751779
const expectedFromNull = [{a: 1, b: 2, c: 3}];
752780
expectedFromNull.schema = source.schema;
781+
expectedFromNull.fullSchema = source.schema;
782+
expectedFromNull.errors = new Map();
753783
assert.deepStrictEqual(
754784
__table(source, operationsFromNull),
755785
expectedFromNull
@@ -760,6 +790,8 @@ describe("__table", () => {
760790
};
761791
const expectedSlice = [{a: 2, b: 4, c: 6}];
762792
expectedSlice.schema = source.schema;
793+
expectedSlice.fullSchema = source.schema;
794+
expectedSlice.errors = new Map();
763795
assert.deepStrictEqual(__table(source, operations), expectedSlice);
764796
});
765797

@@ -794,18 +826,16 @@ describe("__table", () => {
794826
{nameA: 2, b: 4, c: 6},
795827
{nameA: 3, b: 6, c: 9}
796828
];
797-
expected.schema = [
829+
const schema = [
798830
{name: "nameA", type: "integer", inferred: "integer"},
799831
{name: "b", type: "integer", inferred: "integer"},
800832
{name: "c", type: "integer", inferred: "integer"}
801833
];
834+
expected.schema = schema;
835+
expected.fullSchema = schema;
836+
expected.errors = new Map();
802837
assert.deepStrictEqual(__table(source, operations), expected);
803838
source.columns = ["a", "b", "c"];
804-
assert.deepStrictEqual(__table(source, operations).schema, [
805-
{name: "nameA", type: "integer", inferred: "integer"},
806-
{name: "b", type: "integer", inferred: "integer"},
807-
{name: "c", type: "integer", inferred: "integer"}
808-
]);
809839
});
810840

811841
it("__table type assertions", () => {
@@ -823,13 +853,59 @@ describe("__table", () => {
823853
{name: "b", type: "integer", inferred: "integer"},
824854
{name: "c", type: "integer", inferred: "integer"}
825855
];
856+
expected.fullSchema = expected.schema;
857+
expected.errors = new Map();
826858
assert.deepStrictEqual(__table(source, operations), expected);
827859
source.columns = ["a", "b", "c"];
828-
assert.deepStrictEqual(__table(source, operations).schema, [
829-
{name: "a", type: "string", inferred: "integer"},
860+
});
861+
862+
it("__table derived columns", () => {
863+
const operations = {
864+
...EMPTY_TABLE_DATA.operations,
865+
derive: [{name: "d", value: (row) => row.a ** 2}]
866+
};
867+
const expected = [
868+
{a: 1, b: 2, c: 3, d: 1},
869+
{a: 2, b: 4, c: 6, d: 4},
870+
{a: 3, b: 6, c: 9, d: 9}
871+
];
872+
expected.schema = [
873+
{name: "a", type: "integer", inferred: "integer"},
830874
{name: "b", type: "integer", inferred: "integer"},
831-
{name: "c", type: "integer", inferred: "integer"}
832-
]);
875+
{name: "c", type: "integer", inferred: "integer"},
876+
{name: "d", type: "integer", inferred: "integer"}
877+
];
878+
expected.fullSchema = expected.schema;
879+
expected.errors = new Map();
880+
assert.deepStrictEqual(__table(source, operations), expected);
881+
});
882+
883+
it("__table derived columns with errors", () => {
884+
const functionWithError = (row) => row.a.b.c;
885+
const operations = {
886+
...EMPTY_TABLE_DATA.operations,
887+
derive: [{name: "d", value: functionWithError}]
888+
};
889+
let error;
890+
try {
891+
functionWithError(source[0]);
892+
} catch (e) {
893+
error = e;
894+
}
895+
const expected = [
896+
{a: 1, b: 2, c: 3, d: undefined},
897+
{a: 2, b: 4, c: 6, d: undefined},
898+
{a: 3, b: 6, c: 9, d: undefined}
899+
];
900+
expected.schema = [
901+
{name: "a", type: "integer", inferred: "integer"},
902+
{name: "b", type: "integer", inferred: "integer"},
903+
{name: "c", type: "integer", inferred: "integer"},
904+
{name: "d", type: "other", inferred: "other"}
905+
];
906+
expected.fullSchema = expected.schema;
907+
expected.errors = new Map([["d", [{index: 0, error}, {index: 1, error}, {index: 2, error}]]]);
908+
assert.deepStrictEqual(__table(source, operations), expected);
833909
});
834910
});
835911

0 commit comments

Comments
 (0)