Commit cea35d7

use DuckDB for more data sources (#324)
* use DuckDB for more data sources
* friendly table name for SQL cells
* fix for table names that need escaping
* friendly name for tables, too
* friendlier generated SQL
* escape the table name, too
* support primitive arrays
* fix FLOAT type
* cache data sources
* TSV for DuckDB
1 parent c8c153c commit cea35d7
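To make the headline change concrete, here is a rough sketch of the new behavior (the file name and query are hypothetical, and FileAttachment assumes an Observable notebook runtime; this snippet is not part of the commit):

import {loadDataSource} from "./src/table.js";

// A SQL cell backed by a CSV attachment now resolves to a DuckDB client whose table
// carries a friendly name: "flights@2.csv" becomes the table "flights" (the "@2"
// version suffix and the ".csv" extension are stripped).
const db = await loadDataSource(FileAttachment("flights@2.csv"), "sql");
const rows = await db.query("SELECT origin, destination, delay FROM flights WHERE delay > 60");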

File tree: 4 files changed, 115 additions and 51 deletions

src/arrow.js (7 additions, 0 deletions)

@@ -1,3 +1,6 @@
+import {arrow9 as arrow} from "./dependencies.js";
+import {cdn} from "./require.js";
+
 // Returns true if the value is an Apache Arrow table. This uses a “duck” test
 // (instead of strict instanceof) because we want it to work with a range of
 // Apache Arrow versions at least 7.0.0 or above.
@@ -56,3 +59,7 @@ function getArrowType(type) {
       return "other";
   }
 }
+
+export async function loadArrow() {
+  return await import(`${cdn}${arrow.resolve()}`);
+}
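As a quick illustration of the new loadArrow export (a sketch only; the sample data is made up, mirroring how asArrowTable in src/table.js uses it):

import {loadArrow} from "./src/arrow.js";

// Dynamically load Apache Arrow (arrow9) from the CDN, then build in-memory tables.
const arrow = await loadArrow();
const fromObjects = arrow.tableFromJSON([{x: 1, y: 2}, {x: 3, y: 4}]); // array of objects
const fromPrimitives = arrow.tableFromArrays({values: [1, 2, 3]});     // primitive array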

src/duckdb.js (7 additions, 10 deletions)

@@ -1,5 +1,5 @@
-import {getArrowTableSchema, isArrowTable} from "./arrow.js";
-import {arrow9 as arrow, duckdb} from "./dependencies.js";
+import {getArrowTableSchema, isArrowTable, loadArrow} from "./arrow.js";
+import {duckdb} from "./dependencies.js";
 import {FileAttachment} from "./fileAttachment.js";
 import {cdn} from "./require.js";
 
@@ -111,7 +111,7 @@ export class DuckDBClient {
   }
 
   async describeColumns({table} = {}) {
-    const columns = await this.query(`DESCRIBE ${table}`);
+    const columns = await this.query(`DESCRIBE ${this.escape(table)}`);
     return columns.map(({column_name, column_type, null: nullable}) => ({
       name: column_name,
       type: getDuckDBType(column_type),
@@ -169,9 +169,11 @@ async function insertFile(database, name, file, options) {
   try {
     switch (file.mimeType) {
       case "text/csv":
+      case "text/tab-separated-values":
         return await connection.insertCSVFromPath(file.name, {
           name,
           schema: "main",
+          delimiter: file.mimeType === "text/csv" ? "," : "\t",
           ...options
         });
       case "application/json":
@@ -202,11 +204,9 @@ async function insertFile(database, name, file, options) {
   }
 }
 
 async function insertArrowTable(database, name, table, options) {
-  const arrow = await loadArrow();
-  const buffer = arrow.tableToIPC(table);
   const connection = await database.connect();
   try {
-    await connection.insertArrowFromIPCStream(buffer, {
+    await connection.insertArrowTable(table, {
       name,
       schema: "main",
       ...options
@@ -241,10 +241,6 @@ async function createDuckDB() {
   return db;
 }
 
-async function loadArrow() {
-  return await import(`${cdn}${arrow.resolve()}`);
-}
-
 // https://duckdb.org/docs/sql/data_types/overview
 function getDuckDBType(type) {
   switch (type) {
@@ -254,6 +250,7 @@ function getDuckDBType(type) {
       return "bigint";
     case "DOUBLE":
     case "REAL":
+    case "FLOAT":
       return "number";
     case "INTEGER":
     case "SMALLINT":

src/table.js (86 additions, 26 deletions)

@@ -1,6 +1,6 @@
 import {ascending, descending, reverse} from "d3-array";
 import {FileAttachment} from "./fileAttachment.js";
-import {isArrowTable} from "./arrow.js";
+import {isArrowTable, loadArrow} from "./arrow.js";
 import {DuckDBClient} from "./duckdb.js";
 
 const nChecks = 20; // number of values to check in each array
@@ -143,43 +143,100 @@ function isTypedArray(value) {
 
 // __query is used by table cells; __query.sql is used by SQL cells.
 export const __query = Object.assign(
-  async (source, operations, invalidation) => {
-    source = await loadDataSource(await source, "table");
+  async (source, operations, invalidation, name) => {
+    source = await loadTableDataSource(await source, name);
     if (isDatabaseClient(source)) return evaluateQuery(source, makeQueryTemplate(operations, source), invalidation);
     if (isDataArray(source)) return __table(source, operations);
     if (!source) throw new Error("missing data source");
     throw new Error("invalid data source");
   },
   {
-    sql(source, invalidation) {
+    sql(source, invalidation, name) {
       return async function () {
-        return evaluateQuery(await loadDataSource(await source, "sql"), arguments, invalidation);
+        return evaluateQuery(await loadSqlDataSource(await source, name), arguments, invalidation);
       };
     }
   }
 );
 
-export async function loadDataSource(source, mode) {
-  if (source instanceof FileAttachment) {
-    if (mode === "table") {
-      switch (source.mimeType) {
-        case "text/csv": return source.csv({typed: true});
-        case "text/tab-separated-values": return source.tsv({typed: true});
-        case "application/json": return source.json();
-      }
+export async function loadDataSource(source, mode, name) {
+  switch (mode) {
+    case "table": return loadTableDataSource(source, name);
+    case "sql": return loadSqlDataSource(source, name);
+  }
+  return source;
+}
+
+// We use a weak map to cache loaded data sources by key so that we don’t have
+// to e.g. create separate SQLiteDatabaseClients every time we’re querying the
+// same SQLite file attachment. Since this is a weak map, unused references will
+// be garbage collected when they are no longer desired. Note: the name should
+// be consistent, as it is not part of the cache key!
+function sourceCache(loadSource) {
+  const cache = new WeakMap();
+  return (source, name) => {
+    if (!source) throw new Error("data source not found");
+    let promise = cache.get(source);
+    if (!promise) {
+      // Warning: do not await here! We need to populate the cache synchronously.
+      promise = loadSource(source, name);
+      cache.set(source, promise);
     }
-    if (mode === "table" || mode === "sql") {
-      switch (source.mimeType) {
-        case "application/x-sqlite3": return source.sqlite();
-      }
-      if (/\.arrow$/i.test(source.name)) return DuckDBClient.of({__table: await source.arrow({version: 9})});
+    return promise;
+  };
+}
+
+const loadTableDataSource = sourceCache(async (source, name) => {
+  if (source instanceof FileAttachment) {
+    switch (source.mimeType) {
+      case "text/csv": return source.csv({typed: true});
+      case "text/tab-separated-values": return source.tsv({typed: true});
+      case "application/json": return source.json();
+      case "application/x-sqlite3": return source.sqlite();
     }
+    if (/\.(arrow|parquet)$/i.test(source.name)) return loadDuckDBClient(source, name);
     throw new Error(`unsupported file type: ${source.mimeType}`);
   }
-  if ((mode === "table" || mode === "sql") && isArrowTable(source)) {
-    return DuckDBClient.of({__table: source});
+  if (isArrowTable(source)) return loadDuckDBClient(source, name);
+  return source;
+});
+
+const loadSqlDataSource = sourceCache(async (source, name) => {
+  if (source instanceof FileAttachment) {
+    switch (source.mimeType) {
+      case "text/csv":
+      case "text/tab-separated-values":
+      case "application/json": return loadDuckDBClient(source, name);
+      case "application/x-sqlite3": return source.sqlite();
+    }
+    if (/\.(arrow|parquet)$/i.test(source.name)) return loadDuckDBClient(source, name);
+    throw new Error(`unsupported file type: ${source.mimeType}`);
   }
+  if (isDataArray(source)) return loadDuckDBClient(await asArrowTable(source, name), name);
+  if (isArrowTable(source)) return loadDuckDBClient(source, name);
   return source;
+});
+
+async function asArrowTable(array, name) {
+  const arrow = await loadArrow();
+  return arrayIsPrimitive(array)
+    ? arrow.tableFromArrays({[name]: array})
+    : arrow.tableFromJSON(array);
+}
+
+function loadDuckDBClient(
+  source,
+  name = source instanceof FileAttachment
+    ? getFileSourceName(source)
+    : "__table"
+) {
+  return DuckDBClient.of({[name]: source});
+}
+
+function getFileSourceName(file) {
+  return file.name
+    .replace(/@\d+(?=\.|$)/, "") // strip Observable file version number
+    .replace(/\.\w+$/, ""); // strip file extension
 }
 
 async function evaluateQuery(source, args, invalidation) {
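To make the caching and naming behavior above concrete, a small usage sketch (the Parquet file is hypothetical; FileAttachment assumes a notebook runtime):

import {loadDataSource} from "./src/table.js";

const file = FileAttachment("sales@3.parquet");

// Both calls hit the same WeakMap entry, so only one DuckDBClient is created and
// both awaits resolve to the same instance.
const a = await loadDataSource(file, "sql");
const b = await loadDataSource(file, "sql");
a === b; // true

// The table name is derived from the file name: "@3" and ".parquet" are stripped.
await a.query("SELECT region, sum(amount) AS total FROM sales GROUP BY region");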
@@ -248,9 +305,9 @@ export function makeQueryTemplate(operations, source) {
     throw new Error("missing from table");
   if (select.columns && select.columns.length === 0)
     throw new Error("at least one column must be selected");
-  const columns = select.columns ? select.columns.map((c) => `t.${escaper(c)}`) : "*";
+  const columns = select.columns ? select.columns.map(escaper).join(", ") : "*";
   const args = [
-    [`SELECT ${columns} FROM ${formatTable(from.table, escaper)} t`]
+    [`SELECT ${columns} FROM ${formatTable(from.table, escaper)}`]
   ];
   for (let i = 0; i < filter.length; ++i) {
     appendSql(i ? `\nAND ` : `\nWHERE `, args);
@@ -303,8 +360,9 @@ function formatTable(table, escaper) {
     if (table.schema != null) from += escaper(table.schema) + ".";
     from += escaper(table.table);
     return from;
+  } else {
+    return escaper(table);
   }
-  return table;
 }
 
 function appendSql(sql, args) {
@@ -313,7 +371,7 @@
 }
 
 function appendOrderBy({column, direction}, args, escaper) {
-  appendSql(`t.${escaper(column)} ${direction.toUpperCase()}`, args);
+  appendSql(`${escaper(column)} ${direction.toUpperCase()}`, args);
 }
 
 function appendWhereEntry({type, operands}, args, escaper) {
@@ -398,7 +456,7 @@ function appendWhereEntry({type, operands}, args, escaper) {
 
 function appendOperand(o, args, escaper) {
   if (o.type === "column") {
-    appendSql(`t.${escaper(o.value)}`, args);
+    appendSql(escaper(o.value), args);
   } else {
     args.push(o.value);
     args[0].push("");
@@ -421,7 +479,9 @@ function likeOperand(operand) {
 }
 
 // This function applies table cell operations to an in-memory table (array of
-// objects); it should be equivalent to the corresponding SQL query.
+// objects); it should be equivalent to the corresponding SQL query. TODO Use
+// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
+// function to do table operations on in-memory data?
 export function __table(source, operations) {
   const input = source;
   let {schema, columns} = source;
test/table-test.js (15 additions, 15 deletions)

@@ -50,7 +50,7 @@ describe("makeQueryTemplate", () => {
   it("makeQueryTemplate select all", () => {
     const source = {name: "db", dialect: "postgres"};
     const operationsColumnsNull = {...baseOperations, select: {columns: null}};
-    assert.deepStrictEqual(makeQueryTemplate(operationsColumnsNull, source), [["SELECT * FROM table1 t"]]);
+    assert.deepStrictEqual(makeQueryTemplate(operationsColumnsNull, source), [["SELECT * FROM table1"]]);
   });
 
   it("makeQueryTemplate invalid filter operation", () => {
@@ -102,7 +102,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2 FROM table1 t\nWHERE t.col1 = ?");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2 FROM table1\nWHERE col1 = ?");
     assert.deepStrictEqual(params, ["val1"]);
   });
 
@@ -124,7 +124,7 @@ describe("makeQueryTemplate", () => {
     const [parts, ...params] = makeQueryTemplate(operations, source);
     assert.deepStrictEqual(
       parts.join("?"),
-      "SELECT t._col1_,t._col2_ FROM table1 t\nWHERE t._col2_ = ?"
+      "SELECT _col1_, _col2_ FROM _table1_\nWHERE _col2_ = ?"
     );
     assert.deepStrictEqual(params, ["val1"]);
   });
@@ -148,7 +148,7 @@ describe("makeQueryTemplate", () => {
     const [parts, ...params] = makeQueryTemplate(operations, source);
     assert.deepStrictEqual(
       parts.join("?"),
-      "SELECT t._col1_,t._col2_ FROM table1 t\nWHERE t._col2_ = ?"
+      "SELECT _col1_, _col2_ FROM _table1_\nWHERE _col2_ = ?"
     );
     assert.deepStrictEqual(params, ["val1"]);
   });
@@ -178,7 +178,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2 FROM table1 t\nWHERE t.col1 IN (?,?,?)\nAND t.col1 NOT IN (?)");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2 FROM table1\nWHERE col1 IN (?,?,?)\nAND col1 NOT IN (?)");
     assert.deepStrictEqual(params, ["val1", "val2", "val3", "val4"]);
   });
 
@@ -192,7 +192,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2,t.col3 FROM table1 t");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2, col3 FROM table1");
     assert.deepStrictEqual(params, []);
   });
 
@@ -207,7 +207,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2 FROM table1 t\nORDER BY t.col1 ASC, t.col2 DESC");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2 FROM table1\nORDER BY col1 ASC, col2 DESC");
     assert.deepStrictEqual(params, []);
   });
 
@@ -224,7 +224,7 @@ describe("makeQueryTemplate", () => {
     const [parts, ...params] = makeQueryTemplate(operations, source);
     assert.deepStrictEqual(
       parts.join("?"),
-      "SELECT t._col1_,t._col2_ FROM table1 t\nORDER BY t._col1_ ASC, t._col2_ DESC"
+      "SELECT _col1_, _col2_ FROM _table1_\nORDER BY _col1_ ASC, _col2_ DESC"
     );
     assert.deepStrictEqual(params, []);
   });
@@ -235,17 +235,17 @@ describe("makeQueryTemplate", () => {
 
     operations.slice = {from: 10, to: 20};
     let [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2 FROM table1 t\nLIMIT 10 OFFSET 10");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2 FROM table1\nLIMIT 10 OFFSET 10");
     assert.deepStrictEqual(params, []);
 
     operations.slice = {from: null, to: 20};
     [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2 FROM table1 t\nLIMIT 20");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2 FROM table1\nLIMIT 20");
     assert.deepStrictEqual(params, []);
 
     operations.slice = {from: 10, to: null};
     [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), `SELECT t.col1,t.col2 FROM table1 t\nLIMIT ${1e9} OFFSET 10`);
+    assert.deepStrictEqual(parts.join("?"), `SELECT col1, col2 FROM table1\nLIMIT ${1e9} OFFSET 10`);
     assert.deepStrictEqual(params, []);
   });
 
@@ -277,7 +277,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2,t.col3 FROM table1 t\nWHERE t.col1 >= ?\nAND t.col2 = ?\nORDER BY t.col1 ASC\nLIMIT 90 OFFSET 10");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2, col3 FROM table1\nWHERE col1 >= ?\nAND col2 = ?\nORDER BY col1 ASC\nLIMIT 90 OFFSET 10");
     assert.deepStrictEqual(params, ["val1", "val2"]);
   });
 
@@ -294,7 +294,7 @@ describe("makeQueryTemplate", () => {
     const [parts] = makeQueryTemplate(operations, source);
     assert.deepStrictEqual(
       parts.join("?"),
-      "SELECT t._col1_,t._col2_,t._col3_ FROM table1 t\nORDER BY t._col1_ ASC\nOFFSET 0 ROWS\nFETCH NEXT 100 ROWS ONLY"
+      "SELECT _col1_, _col2_, _col3_ FROM _table1_\nORDER BY _col1_ ASC\nOFFSET 0 ROWS\nFETCH NEXT 100 ROWS ONLY"
     );
   });
 
@@ -326,7 +326,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT t.col1,t.col2,t.col3 FROM table1 t\nWHERE t.col1 >= ?\nAND t.col2 = ?\nORDER BY t.col2 DESC\nOFFSET 10 ROWS\nFETCH NEXT 90 ROWS ONLY");
+    assert.deepStrictEqual(parts.join("?"), "SELECT col1, col2, col3 FROM table1\nWHERE col1 >= ?\nAND col2 = ?\nORDER BY col2 DESC\nOFFSET 10 ROWS\nFETCH NEXT 90 ROWS ONLY");
     assert.deepStrictEqual(params, ["val1", "val2"]);
   });
 
@@ -358,7 +358,7 @@ describe("makeQueryTemplate", () => {
     };
 
     const [parts, ...params] = makeQueryTemplate(operations, source);
-    assert.deepStrictEqual(parts.join("?"), "SELECT * FROM table1 t\nORDER BY t.col2 DESC\nOFFSET 10 ROWS\nFETCH NEXT 90 ROWS ONLY");
+    assert.deepStrictEqual(parts.join("?"), "SELECT * FROM table1\nORDER BY col2 DESC\nOFFSET 10 ROWS\nFETCH NEXT 90 ROWS ONLY");
     assert.deepStrictEqual(params, []);
   });
 });
