From 8cf199ba17d9105d8f1c14cf0fa078f2ba9e1371 Mon Sep 17 00:00:00 2001 From: "Leo x.m. Zeng" Date: Mon, 24 Feb 2025 23:02:09 +0800 Subject: [PATCH] Support the PostgreSQL domain (#532) Domain types are basically user defined aliases of existing types. This starts supporting columns of such domain types pg_duckdb. One interesting thing is that domain types can also contain constraints, e.g. "an int that is less than 10". This doesn't really matter for reading them though, their actual representation is still always the same as the underlying type. It also doesn't matter when doing operations on them, because Postgres already changes a domain type to its underlying representation automatically. So any constraints on the domain don't apply to a result of an operation (e.g. addition). Fixes #514 --- README.md | 2 +- docs/types.md | 3 +- include/pgduckdb/pg/types.hpp | 3 + src/pg/types.cpp | 40 ++++++++++++ src/pgduckdb_types.cpp | 9 ++- test/regression/expected/domain.out | 95 +++++++++++++++++++++++++++++ test/regression/schedule | 1 + test/regression/sql/domain.sql | 60 ++++++++++++++++++ 8 files changed, 208 insertions(+), 5 deletions(-) create mode 100644 test/regression/expected/domain.out create mode 100644 test/regression/sql/domain.sql diff --git a/README.md b/README.md index 8b6fab2e..fcd20bbf 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ pg_duckdb was developed in collaboration with our partners, [Hydra][] and [Mothe See our [official documentation][docs] for further details. - `SELECT` queries executed by the DuckDB engine can directly read Postgres tables. (If you only query Postgres tables you need to run `SET duckdb.force_execution TO true`, see the **IMPORTANT** section above for details) - - Able to read [data types](https://www.postgresql.org/docs/current/datatype.html) that exist in both Postgres and DuckDB. The following data types are supported: numeric, character, binary, date/time, boolean, uuid, json, and arrays. + - Able to read [data types](https://www.postgresql.org/docs/current/datatype.html) that exist in both Postgres and DuckDB. The following data types are supported: numeric, character, binary, date/time, boolean, uuid, json, domain, and arrays. - If DuckDB cannot support the query for any reason, execution falls back to Postgres. - Read and Write support for object storage (AWS S3, Azure, Cloudflare R2, or Google GCS): - Read parquet, CSV and JSON files: diff --git a/docs/types.md b/docs/types.md index d5eac041..3df9bf5f 100644 --- a/docs/types.md +++ b/docs/types.md @@ -11,6 +11,7 @@ Able to read many [data types](https://www.postgresql.org/docs/current/datatype. - `boolean` - `uuid` - `json`/`jsonb` +- `domain` - `arrays` for all of the above types, but see limitations below about multi-dimensional arrays ## Known limitations @@ -31,7 +32,7 @@ to fix these limitations: -- This configures the column to be a 3-dimensional array of text ALTER TABLE s ALTER COLUMN a SET DATA TYPE text[][][]; ``` - +9. For the `domain` actually, during the execution of the INSERT operation, the check regarding `domain` is conducted by PostgreSQL rather than DuckDB. When we execute the SELECT operation and the type of the queried field is a `domain`, we will convert it to the corresponding base type and let DuckDB handle it. ## Special types pg_duckdb introduces a few special Postgres types. You shouldn't create these types explicitly and normally you don't need to know about their existence, but they might show up in error messages from Postgres. These are explained below: diff --git a/include/pgduckdb/pg/types.hpp b/include/pgduckdb/pg/types.hpp index 3024fa04..66a3b464 100644 --- a/include/pgduckdb/pg/types.hpp +++ b/include/pgduckdb/pg/types.hpp @@ -4,4 +4,7 @@ namespace pgduckdb::pg { bool IsArrayType(Oid type_oid); +bool IsDomainType(Oid type_oid); +bool IsArrayDomainType(Oid type_oid); +Oid GetBaseDuckColumnType(Oid attribute_type_oid); } diff --git a/src/pg/types.cpp b/src/pg/types.cpp index d057992c..3fba995f 100644 --- a/src/pg/types.cpp +++ b/src/pg/types.cpp @@ -3,6 +3,9 @@ extern "C" { #include "postgres.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" +#include "catalog/pg_type.h" +#include "executor/tuptable.h" } namespace pgduckdb::pg { @@ -13,4 +16,41 @@ IsArrayType(Oid type_oid) { return PostgresFunctionGuard(get_element_type, type_oid) != InvalidOid; } +bool +IsDomainType(Oid type_oid) { + return PostgresFunctionGuard(get_typtype, type_oid) == TYPTYPE_DOMAIN; +} + +bool +IsArrayDomainType(Oid type_oid) { + bool is_array_domain = false; + if (IsDomainType(type_oid)) { + if (PostgresFunctionGuard(get_base_element_type, type_oid) != InvalidOid) { + is_array_domain = true; + } + } + return is_array_domain; +} + +static Oid +GetBaseDuckColumnType_C(Oid attribute_type_oid) { + Oid typoid = attribute_type_oid; + if (get_typtype(attribute_type_oid) == TYPTYPE_DOMAIN) { + /* It is a domain type that needs to be reduced to its base type */ + typoid = getBaseType(attribute_type_oid); + } else if (type_is_array(attribute_type_oid)) { + Oid eltoid = get_base_element_type(attribute_type_oid); + if (OidIsValid(eltoid) && get_typtype(eltoid) == TYPTYPE_DOMAIN) { + /* When the member type of an array is domain, you need to build a base array type */ + typoid = get_array_type(getBaseType(eltoid)); + } + } + return typoid; +} + +Oid +GetBaseDuckColumnType(Oid attribute_type_oid) { + return PostgresFunctionGuard(GetBaseDuckColumnType_C, attribute_type_oid); +} + } // namespace pgduckdb::pg diff --git a/src/pgduckdb_types.cpp b/src/pgduckdb_types.cpp index 7bc5185f..fb1c1d0f 100644 --- a/src/pgduckdb_types.cpp +++ b/src/pgduckdb_types.cpp @@ -32,6 +32,7 @@ extern "C" { } #include "pgduckdb/pgduckdb_detoast.hpp" +#include "pgduckdb/pgduckdb_process_lock.hpp" namespace pgduckdb { @@ -909,7 +910,8 @@ numeric_typmod_scale(int32 typmod) { duckdb::LogicalType ConvertPostgresToBaseDuckColumnType(Form_pg_attribute &attribute) { - switch (attribute->atttypid) { + Oid typoid = pg::GetBaseDuckColumnType(attribute->atttypid); + switch (typoid) { case BOOLOID: case BOOLARRAYOID: return duckdb::LogicalTypeId::BOOLEAN; @@ -983,9 +985,10 @@ ConvertPostgresToBaseDuckColumnType(Form_pg_attribute &attribute) { duckdb::LogicalType ConvertPostgresToDuckColumnType(Form_pg_attribute &attribute) { auto base_type = ConvertPostgresToBaseDuckColumnType(attribute); - if (!pg::IsArrayType(attribute->atttypid)) { - return base_type; + if (!pg::IsArrayDomainType(attribute->atttypid)) { + return base_type; + } } auto dimensions = attribute->attndims; diff --git a/test/regression/expected/domain.out b/test/regression/expected/domain.out new file mode 100644 index 00000000..96743635 --- /dev/null +++ b/test/regression/expected/domain.out @@ -0,0 +1,95 @@ +create domain domainvarchar varchar(5) check (value is not null); +create domain domainnumeric numeric(8,2) check (value is not null); +create domain domainint4 int4 check (value > 0); +create domain domaintext text check (value is not null); +-- Test tables using domains +create table basictest + ( testint4 domainint4 + , testtext domaintext + , testvarchar domainvarchar + , testnumeric domainnumeric + ); +-- In fact, when we carry out an INSERT operation, the check regarding "domain" is conducted by PostgreSQL rather than DuckDB. +-- When we execute a SELECT operation and the field of the query is "domain", we will convert it to the corresponding base type +-- and let DuckDB handle it. +INSERT INTO basictest values ('88', 'haha', 'short', '123.12'); -- Good +INSERT INTO basictest values ('88', 'haha', 'short text', '123.12'); -- Bad varchar +ERROR: value too long for type character varying(5) +INSERT INTO basictest values ('88', 'haha', 'short', '123.1212'); -- Truncate numeric +-- not support. It will be converted to the following statement +-- SELECT ('-5'::integer)::domainint4 AS domainint4 FROM pgduckdb.xxx.basictest +SELECT (-5)::domainint4 FROM basictest; +WARNING: (PGDuckDB/CreatePlan) Prepared query returned an error: 'Catalog Error: Type with name domainint4 does not exist! +Did you mean "tinyint"? +ERROR: value for domain domainint4 violates check constraint "domainint4_check" +select * from basictest; + testint4 | testtext | testvarchar | testnumeric +----------+----------+-------------+------------- + 88 | haha | short | 123.12 + 88 | haha | short | 123.12 +(2 rows) + +select testtext || testvarchar as concat, testnumeric + 42 as sum +from basictest; + concat | sum +-----------+--------- + hahashort | 165.120 + hahashort | 165.120 +(2 rows) + +select * from basictest where testtext = 'haha'; + testint4 | testtext | testvarchar | testnumeric +----------+----------+-------------+------------- + 88 | haha | short | 123.12 + 88 | haha | short | 123.12 +(2 rows) + +select * from basictest where testvarchar = 'short'; + testint4 | testtext | testvarchar | testnumeric +----------+----------+-------------+------------- + 88 | haha | short | 123.12 + 88 | haha | short | 123.12 +(2 rows) + +-- array_domain +create domain domain_int_array as INT[]; +CREATE TABLE domain_int_array_1d(a domain_int_array); +INSERT INTO domain_int_array_1d SELECT CAST(a as domain_int_array) FROM (VALUES + ('{1, 2, 3}'), + (NULL), + ('{4, 5, NULL, 7}'), + ('{}') +) t(a); +SELECT * FROM domain_int_array_1d; + a +-------------- + {1,2,3} + + {4,5,NULL,7} + {} +(4 rows) + +CREATE TABLE domain_int_array_2d(a domainint4[]); +INSERT INTO domain_int_array_2d SELECT CAST(a as domain_int_array) FROM (VALUES + ('{1, 2, 3}'), + (NULL), + ('{4, 5, NULL, 7}'), + ('{}') +) t(a); +SELECT * FROM domain_int_array_2d; + a +-------------- + {1,2,3} + + {4,5,NULL,7} + {} +(4 rows) + +drop table domain_int_array_2d; +drop table domain_int_array_1d; +drop domain domain_int_array; +drop table basictest; +drop domain domainvarchar restrict; +drop domain domainnumeric restrict; +drop domain domainint4 restrict; +drop domain domaintext; diff --git a/test/regression/schedule b/test/regression/schedule index 136f6e6f..5e5621c2 100644 --- a/test/regression/schedule +++ b/test/regression/schedule @@ -15,6 +15,7 @@ test: hugeint_conversion test: read_functions test: duckdb_only_functions test: duckdb_recycle +test: domain test: cte test: create_schema test: create_table_as diff --git a/test/regression/sql/domain.sql b/test/regression/sql/domain.sql new file mode 100644 index 00000000..41812234 --- /dev/null +++ b/test/regression/sql/domain.sql @@ -0,0 +1,60 @@ +create domain domainvarchar varchar(5) check (value is not null); +create domain domainnumeric numeric(8,2) check (value is not null); +create domain domainint4 int4 check (value > 0); +create domain domaintext text check (value is not null); + +-- Test tables using domains +create table basictest + ( testint4 domainint4 + , testtext domaintext + , testvarchar domainvarchar + , testnumeric domainnumeric + ); + +-- In fact, when we carry out an INSERT operation, the check regarding "domain" is conducted by PostgreSQL rather than DuckDB. +-- When we execute a SELECT operation and the field of the query is "domain", we will convert it to the corresponding base type +-- and let DuckDB handle it. +INSERT INTO basictest values ('88', 'haha', 'short', '123.12'); -- Good +INSERT INTO basictest values ('88', 'haha', 'short text', '123.12'); -- Bad varchar +INSERT INTO basictest values ('88', 'haha', 'short', '123.1212'); -- Truncate numeric + +-- not support. It will be converted to the following statement +-- SELECT ('-5'::integer)::domainint4 AS domainint4 FROM pgduckdb.xxx.basictest +SELECT (-5)::domainint4 FROM basictest; + +select * from basictest; + +select testtext || testvarchar as concat, testnumeric + 42 as sum +from basictest; + +select * from basictest where testtext = 'haha'; +select * from basictest where testvarchar = 'short'; + +-- array_domain +create domain domain_int_array as INT[]; +CREATE TABLE domain_int_array_1d(a domain_int_array); +INSERT INTO domain_int_array_1d SELECT CAST(a as domain_int_array) FROM (VALUES + ('{1, 2, 3}'), + (NULL), + ('{4, 5, NULL, 7}'), + ('{}') +) t(a); +SELECT * FROM domain_int_array_1d; + +CREATE TABLE domain_int_array_2d(a domainint4[]); +INSERT INTO domain_int_array_2d SELECT CAST(a as domain_int_array) FROM (VALUES + ('{1, 2, 3}'), + (NULL), + ('{4, 5, NULL, 7}'), + ('{}') +) t(a); +SELECT * FROM domain_int_array_2d; + +drop table domain_int_array_2d; +drop table domain_int_array_1d; +drop domain domain_int_array; +drop table basictest; +drop domain domainvarchar restrict; +drop domain domainnumeric restrict; +drop domain domainint4 restrict; +drop domain domaintext;