Skip to content

Commit

Permalink
Merge branch 'main' of github.com:Leo-XM-Zeng/pg_duckdb into support_…
Browse files Browse the repository at this point in the history
…domain
  • Loading branch information
Leo-XM-Zeng committed Feb 23, 2025
2 parents f4c21e3 + 05e42f0 commit cc1ebaf
Show file tree
Hide file tree
Showing 14 changed files with 228 additions and 34 deletions.
11 changes: 10 additions & 1 deletion .github/workflows/build_and_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,16 @@ jobs:
- name: Build pg_duckdb extension
id: build
working-directory: duckdb
run: ERROR_ON_WARNING=1 make -j8 install
# For PG17 we build DuckDB as a static library. There's nothing special
# about the static library, nor PG17. This is only done so that we have
# CI coverage for our logic to build a static library version.
run: |
if [ '${{ matrix.version }}' = REL_17_STABLE ]; then
ERROR_ON_WARNING=1 make -j8 install DUCKDB_BUILD=ReleaseStatic
else
ERROR_ON_WARNING=1 make -j8 install
fi
- name: Run make installcheck
id: installcheck
Expand Down
26 changes: 22 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,27 @@ DUCKDB_BUILD_TYPE=
# Select the DuckDB build flavour from DUCKDB_BUILD:
#   Debug         - unoptimized build with glibc++ assertions, DuckDB "debug" target
#   ReleaseStatic - release build, but DuckDB's "bundle-library" target so we
#                   can link a single static archive instead of libduckdb.so
#   (anything else, including unset) - regular shared-library release build
ifeq ($(DUCKDB_BUILD), Debug)
DUCKDB_BUILD_CXX_FLAGS = -g -O0 -D_GLIBCXX_ASSERTIONS
DUCKDB_BUILD_TYPE = debug
DUCKDB_MAKE_TARGET = debug
else ifeq ($(DUCKDB_BUILD), ReleaseStatic)
DUCKDB_BUILD_CXX_FLAGS =
DUCKDB_BUILD_TYPE = release
DUCKDB_MAKE_TARGET = bundle-library
else
DUCKDB_BUILD_CXX_FLAGS =
DUCKDB_BUILD_TYPE = release
DUCKDB_MAKE_TARGET = release
endif

DUCKDB_LIB = libduckdb$(DLSUFFIX)
FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB)
PG_DUCKDB_LINK_FLAGS = -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lstdc++ -llz4
DUCKDB_BUILD_DIR = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)

# How we link DuckDB, and which build artifact we depend on:
# - ReleaseStatic: link the bundled static archive directly into pg_duckdb.
# - otherwise: link the shared library via -lduckdb.
# Use $(DUCKDB_BUILD_DIR) consistently instead of hard-coding the
# build/release path (identical here since ReleaseStatic implies
# DUCKDB_BUILD_TYPE = release, but it keeps the path defined in one place).
ifeq ($(DUCKDB_BUILD), ReleaseStatic)
PG_DUCKDB_LINK_FLAGS += $(DUCKDB_BUILD_DIR)/libduckdb_bundle.a
FULL_DUCKDB_LIB = $(DUCKDB_BUILD_DIR)/$(DUCKDB_LIB)
else
PG_DUCKDB_LINK_FLAGS += -lduckdb
# NOTE: DUCKDB_LIB is already the full file name (libduckdb$(DLSUFFIX)), so
# do not append the suffix again here — the old value pointed at
# .../src/libduckdb.so/libduckdb.so, which can never exist.
FULL_DUCKDB_LIB = $(DUCKDB_BUILD_DIR)/src/$(DUCKDB_LIB)
endif

ERROR_ON_WARNING ?=
ifeq ($(ERROR_ON_WARNING), 1)
Expand All @@ -54,7 +68,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} -
# changes to the vendored code in one place.
override PG_CFLAGS += -Wno-declaration-after-statement

SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4
SHLIB_LINK += $(PG_DUCKDB_LINK_FLAGS)

include Makefile.global

Expand Down Expand Up @@ -102,10 +116,14 @@ $(FULL_DUCKDB_LIB): .git/modules/third_party/duckdb/HEAD
DISABLE_ASSERTIONS=$(DUCKDB_DISABLE_ASSERTIONS) \
EXTENSION_CONFIGS="../pg_duckdb_extensions.cmake" \
$(MAKE) -C third_party/duckdb \
$(DUCKDB_BUILD_TYPE)
$(DUCKDB_MAKE_TARGET)

# Install the extension's shared object. For dynamic builds we must also copy
# libduckdb into Postgres' library directory; with ReleaseStatic the DuckDB
# code is already linked into the extension, so nothing extra is installed.
# A single rule header with a conditional recipe avoids duplicating the
# target/prerequisite line in both branches.
install-duckdb: $(FULL_DUCKDB_LIB) $(shlib)
ifneq ($(DUCKDB_BUILD), ReleaseStatic)
	$(install_bin) -m 755 $(FULL_DUCKDB_LIB) $(DESTDIR)$(PG_LIB)
endif

clean-duckdb:
rm -rf third_party/duckdb/build
Expand Down
2 changes: 2 additions & 0 deletions docs/compilation.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ To build and install, run:
make install
```

If you want to link `libduckdb` statically, set `DUCKDB_BUILD=ReleaseStatic` when interacting with `make(1)`.

Add `pg_duckdb` to the `shared_preload_libraries` in your `postgresql.conf` file:

```ini
Expand Down
2 changes: 1 addition & 1 deletion docs/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ Since any connection that uses DuckDB will have its own DuckDB instance, these s

### `duckdb.max_memory` / `duckdb.memory_limit`

The maximum memory DuckDB can use within a single Postgres connection. This is somewhat comparable to Postgres its `work_mem` setting.
The maximum memory DuckDB can use within a single Postgres connection. This is somewhat comparable to Postgres's `work_mem` setting. When set to the empty string, DuckDB's own default is used, which is 80% of RAM.

Default: `"4GB"`

Expand Down
10 changes: 7 additions & 3 deletions docs/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Able to read many [data types](https://www.postgresql.org/docs/current/datatype.
- `uuid`
- `json`/`jsonb`
- `domain`
- `arrays` for all of the above types
- `arrays` for all of the above types, but see limitations below about multi-dimensional arrays

## Known limitations

Expand All @@ -27,8 +27,12 @@ to fix these limitations:
5. `jsonb` columns are converted to `json` columns when reading from DuckDB. This is because DuckDB does not have a `jsonb` type.
6. Many Postgres `json` and `jsonb` functions and operators are not implemented in DuckDB. Instead you can use DuckDB json functions and operators. See the [DuckDB documentation](https://duckdb.org/docs/data/json/json_functions) for more information on these functions.
7. The DuckDB `tinyint` type is converted to a `char` type in Postgres. This is because Postgres does not have a `tinyint` type. This causes it to be displayed as a hex code instead of a regular number.
8. For the `domain` actually, during the execution of the INSERT operation, the check regarding `domain` is conducted by PostgreSQL rather than DuckDB. When we execute the SELECT operation and the type of the queried field is a `domain`, we will convert it to the corresponding base type and let DuckDB handle it.

8. Conversion between Postgres multi-dimensional arrays and DuckDB nested `LIST`s can run into various problems, because neither database supports exactly what the other supports. Specifically, in Postgres it's allowed for different arrays in a column to have a different number of dimensions, e.g. `[1]` and `[[1], [2]]` can both occur in the same column. In DuckDB that's not allowed, i.e. the amount of nesting should always be the same. On the other hand, in DuckDB it's valid for different lists at the same nest-level to contain a different number of elements, e.g. `[[1], [1, 2]]`. This is not allowed in Postgres. So conversion between these types is only possible when the arrays follow the common subset. Another possible problem that you can run into is that pg_duckdb uses the Postgres column metadata to determine the number of dimensions that an array has. Since Postgres doesn't complain when you add arrays of different dimensions, it's possible that the number of dimensions in the column metadata does not match the actual number of dimensions. To solve this you need to alter the column type:
```sql
-- This configures the column to be a 3-dimensional array of text
ALTER TABLE s ALTER COLUMN a SET DATA TYPE text[][][];
```
9. For `domain` types, the `domain` constraint check during an INSERT operation is performed by PostgreSQL rather than DuckDB. When a SELECT queries a field whose type is a `domain`, we convert it to the corresponding base type and let DuckDB handle it.
## Special types

pg_duckdb introduces a few special Postgres types. You shouldn't create these types explicitly and normally you don't need to know about their existence, but they might show up in error messages from Postgres. These are explained below:
Expand Down
10 changes: 10 additions & 0 deletions include/pgduckdb/pg/types.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

#include "pgduckdb/pg/declarations.hpp"

namespace pgduckdb::pg {
// True if type_oid is an array type, i.e. it has an element type.
bool IsArrayType(Oid type_oid);
// True if type_oid is a domain (pg_type.typtype == TYPTYPE_DOMAIN).
bool IsDomainType(Oid type_oid);
// True if type_oid is a domain whose base type is an array type.
bool IsArrayDomainType(Oid type_oid);
// Resolves domains (and arrays whose element type is a domain) to the
// underlying base type DuckDB should use for the column; any other type
// oid is returned unchanged.
Oid GetBaseDuckColumnType(Oid attribute_typoid);
}
52 changes: 52 additions & 0 deletions src/pg/types.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "pgduckdb/pg/types.hpp"
#include "pgduckdb/pgduckdb_utils.hpp"
extern "C" {
#include "postgres.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "catalog/pg_type.h"
#include "executor/tuptable.h"
}

namespace pgduckdb::pg {

bool
IsArrayType(Oid type_oid) {
	/*
	 * Inlined version of Postgres' type_is_array: a type is an array iff
	 * the catalog records an element type for it.
	 */
	Oid element_oid = PostgresFunctionGuard(get_element_type, type_oid);
	return element_oid != InvalidOid;
}

bool
IsDomainType(Oid type_oid) {
	/* A type is a domain iff its pg_type.typtype is TYPTYPE_DOMAIN. */
	char typtype = PostgresFunctionGuard(get_typtype, type_oid);
	return typtype == TYPTYPE_DOMAIN;
}

bool
IsArrayDomainType(Oid type_oid) {
	/*
	 * A type is an "array domain" when it is a domain and its base type
	 * has an element type (i.e. the base type is an array).
	 */
	if (!IsDomainType(type_oid)) {
		return false;
	}
	return PostgresFunctionGuard(get_base_element_type, type_oid) != InvalidOid;
}

Oid
GetBaseDuckColumnType(Oid attribute_typoid) {
	/*
	 * Map a column's type oid to the type oid DuckDB should work with:
	 * domains collapse to their base type, and arrays of domains collapse
	 * to the array type over the domain's base type. Everything else is
	 * returned as-is. The lock is held because the Postgres catalog
	 * lookups below are called directly, without PostgresFunctionGuard.
	 */
	std::lock_guard<std::recursive_mutex> lock(pgduckdb::GlobalProcessLock::GetLock());

	if (get_typtype(attribute_typoid) == TYPTYPE_DOMAIN) {
		/* Plain domain: reduce it to its base type. */
		return getBaseType(attribute_typoid);
	}

	if (type_is_array(attribute_typoid)) {
		Oid element_oid = get_base_element_type(attribute_typoid);
		if (OidIsValid(element_oid) && get_typtype(element_oid) == TYPTYPE_DOMAIN) {
			/* Array of domains: build the array type over the base type. */
			return get_array_type(getBaseType(element_oid));
		}
	}

	return attribute_typoid;
}

} // namespace pgduckdb::pg
2 changes: 2 additions & 0 deletions src/pgduckdb_ddl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,10 @@ CheckOnCommitSupport(OnCommitAction on_commit) {
break;
case ONCOMMIT_DROP:
elog(ERROR, "DuckDB does not support ON COMMIT DROP");
break;
default:
elog(ERROR, "Unsupported ON COMMIT clause: %d", on_commit);
break;
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/pgduckdb_duckdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ DuckDBManager::Initialize() {
SET_DUCKDB_OPTION(autoinstall_known_extensions);
SET_DUCKDB_OPTION(autoload_known_extensions);

if (duckdb_maximum_memory != NULL) {
/*
 * Only apply duckdb.max_memory when it is a non-empty string; per the
 * accompanying docs change, an empty value means "use DuckDB's own
 * default" (80% of RAM). The check must therefore be strlen(...) != 0 —
 * with == 0 the limit would be parsed only from an empty string and
 * never applied from a real value.
 */
if (duckdb_maximum_memory != NULL && strlen(duckdb_maximum_memory) != 0) {
	config.options.maximum_memory = duckdb::DBConfig::ParseMemoryLimit(duckdb_maximum_memory);
	elog(DEBUG2, "[PGDuckDB] Set DuckDB option: 'maximum_memory'=%s", duckdb_maximum_memory);
}
Expand Down
48 changes: 27 additions & 21 deletions src/pgduckdb_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "pgduckdb/pgduckdb_types.hpp"
#include "pgduckdb/pgduckdb_utils.hpp"
#include "pgduckdb/scan/postgres_scan.hpp"
#include "pgduckdb/pg/types.hpp"

extern "C" {

Expand Down Expand Up @@ -909,18 +910,7 @@ numeric_typmod_scale(int32 typmod) {

duckdb::LogicalType
ConvertPostgresToBaseDuckColumnType(Form_pg_attribute &attribute) {
Oid typoid = attribute->atttypid;
if (get_typtype(attribute->atttypid) == TYPTYPE_DOMAIN) {
/* It is a domain type that needs to be reduced to its base type */
typoid = getBaseType(attribute->atttypid);
} else if (type_is_array(attribute->atttypid)) {
Oid eltoid = get_base_element_type(attribute->atttypid);
if (OidIsValid(eltoid) && get_typtype(eltoid) == TYPTYPE_DOMAIN) {
/* When the member type of an array is domain, you need to build a base array type */
typoid = get_array_type(getBaseType(eltoid));
}
}

Oid typoid = pg::GetBaseDuckColumnType(attribute->atttypid);
switch (typoid) {
case BOOLOID:
case BOOLARRAYOID:
Expand Down Expand Up @@ -994,19 +984,35 @@ ConvertPostgresToBaseDuckColumnType(Form_pg_attribute &attribute) {

duckdb::LogicalType
ConvertPostgresToDuckColumnType(Form_pg_attribute &attribute) {
auto dimensions = attribute->attndims;
if (get_typtype(attribute->atttypid) == TYPTYPE_DOMAIN) {
/* If the domain is an array type, you need to obtain the corresponding array dimension information */
if (type_is_array_domain(attribute->atttypid)) {
HeapTuple typeTuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(attribute->atttypid));
dimensions = ((Form_pg_type)GETSTRUCT(typeTuple))->typndims;
ReleaseSysCache(typeTuple);
auto base_type = ConvertPostgresToBaseDuckColumnType(attribute);
if (!pg::IsArrayType(attribute->atttypid)) {
if (!pg::IsArrayDomainType(attribute->atttypid)) {
return base_type;
}
}

auto base_type = ConvertPostgresToBaseDuckColumnType(attribute);
auto dimensions = attribute->attndims;

/*
* Multi-dimensional arrays in Postgres and nested lists in DuckDB are
* quite different in behaviour. We try to map them to eachother anyway,
* because in a lot of cases that works fine. But there's also quite a few
* where users will get errors.
*
* To support multi-dimensional arrays that are stored in Postgres tables,
* we assume that the attndims value is correct. If people have specified
* the matching number of [] when creating the table, that is the case.
* It's even possible to store arrays of different dimensions in a single
* column. DuckDB does not support that.
*
* In certain cases (such as tables created by a CTAS) attndims can even be
* 0 for array types. It's impossible for us to find out what the actual
* dimensions are without reading the first row. Given that it's most
* to use single-dimensional arrays, we assume that such a column stores
* those.
*/
if (dimensions == 0) {
return base_type;
dimensions = 1;
}

for (int i = 0; i < dimensions; i++) {
Expand Down
3 changes: 2 additions & 1 deletion src/scan/postgres_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,8 @@ PostgresScanFunctionData::~PostgresScanFunctionData() {
//

PostgresScanTableFunction::PostgresScanTableFunction()
: TableFunction("postgres_scan", {}, PostgresScanFunction, nullptr, PostgresScanInitGlobal, PostgresScanInitLocal) {
: TableFunction("pgduckdb_postgres_scan", {}, PostgresScanFunction, nullptr, PostgresScanInitGlobal,
PostgresScanInitLocal) {
named_parameters["cardinality"] = duckdb::LogicalType::UBIGINT;
named_parameters["relid"] = duckdb::LogicalType::UINTEGER;
named_parameters["snapshot"] = duckdb::LogicalType::POINTER;
Expand Down
57 changes: 57 additions & 0 deletions test/regression/expected/array_problems.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
set duckdb.force_execution TO FALSE;
CREATE TABLE s (a text[]);
INSERT INTO s VALUES (ARRAY['abc', 'def', 'ghi']);
-- Because the next table is created using a CTAS, attndims is set to 0. That
-- confused us in the past. See #556 for details. We assume it's single
-- dimensional now.
CREATE TABLE t AS TABLE s;
SELECT * FROM s;
a
---------------
{abc,def,ghi}
(1 row)

SELECT * FROM t;
a
---------------
{abc,def,ghi}
(1 row)

SET duckdb.force_execution TO true;
SELECT * FROM s;
a
---------------
{abc,def,ghi}
(1 row)

SELECT * FROM t;
a
---------------
{abc,def,ghi}
(1 row)

-- Processing arryas of different dimensions in the same column is not something
-- that DuckDB can handle.
INSERT INTO s VALUES(ARRAY[['a', 'b'],['c','d']]);
SELECT * FROM s;
ERROR: (PGDuckDB/Duckdb_ExecCustomScan_Cpp) Invalid Input Error: Dimensionality of the schema and the data does not match, data contains more dimensions than the amount of dimensions specified by the schema
TRUNCATE s;
-- And we assume that the table metadata is correct about the dimensionality.
-- So even if the stored dimensionality is consistently wrong we will throw an
-- error.
INSERT INTO s VALUES(ARRAY[['a', 'b'],['c','d']]);
SELECT * FROM s;
ERROR: (PGDuckDB/Duckdb_ExecCustomScan_Cpp) Invalid Input Error: Dimensionality of the schema and the data does not match, data contains more dimensions than the amount of dimensions specified by the schema
-- But if you change the defintion of the table, we will be able to handle it.
ALTER TABLE s ALTER COLUMN a SET DATA TYPE text[][];
SELECT * FROM s;
a
---------------
{{a,b},{c,d}}
(1 row)

-- Similarly Posgres cannot support nested lists where different sub-lists at
-- the same level have a different length.
SELECT * FROM duckdb.query($$ SELECT ARRAY[ARRAY[1,2], ARRAY[3,4,5]] arr $$);
ERROR: (PGDuckDB/Duckdb_ExecCustomScan_Cpp) Invalid Input Error: Expected 2 values in list at dimension 1, found 3 instead
DROP TABLE s, t;
4 changes: 2 additions & 2 deletions test/regression/expected/duckdb_recycle.out
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ EXPLAIN SELECT count(*) FROM ta;
│ count_star() │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
POSTGRES_SCAN
PGDUCKDB_POSTGRES_SCAN
│ ──────────────────── │
│ Table: ta │
│ │
Expand All @@ -37,7 +37,7 @@ EXPLAIN SELECT count(*) FROM ta;
│ count_star() │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
POSTGRES_SCAN
PGDUCKDB_POSTGRES_SCAN
│ ──────────────────── │
│ Table: ta │
│ │
Expand Down
33 changes: 33 additions & 0 deletions test/regression/sql/array_problems.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
set duckdb.force_execution TO FALSE;
CREATE TABLE s (a text[]);
INSERT INTO s VALUES (ARRAY['abc', 'def', 'ghi']);
-- Because the next table is created using a CTAS, attndims is set to 0. That
-- confused us in the past. See #556 for details. We assume it's single
-- dimensional now.
CREATE TABLE t AS TABLE s;
SELECT * FROM s;
SELECT * FROM t;
SET duckdb.force_execution TO true;
SELECT * FROM s;
SELECT * FROM t;

-- Processing arryas of different dimensions in the same column is not something
-- that DuckDB can handle.
INSERT INTO s VALUES(ARRAY[['a', 'b'],['c','d']]);
SELECT * FROM s;
TRUNCATE s;

-- And we assume that the table metadata is correct about the dimensionality.
-- So even if the stored dimensionality is consistently wrong we will throw an
-- error.
INSERT INTO s VALUES(ARRAY[['a', 'b'],['c','d']]);
SELECT * FROM s;
-- But if you change the defintion of the table, we will be able to handle it.
ALTER TABLE s ALTER COLUMN a SET DATA TYPE text[][];
SELECT * FROM s;

-- Similarly Posgres cannot support nested lists where different sub-lists at
-- the same level have a different length.
SELECT * FROM duckdb.query($$ SELECT ARRAY[ARRAY[1,2], ARRAY[3,4,5]] arr $$);

DROP TABLE s, t;

0 comments on commit cc1ebaf

Please sign in to comment.