Skip to content

Commit 6b39cc6

Browse files
committed
struct parsing for duckdb working!
1 parent 6539830 commit 6b39cc6

File tree

3 files changed

+66
-8
lines changed

3 files changed

+66
-8
lines changed

dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,16 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) {
246246
// TODO: this should be postponed to post-processing.
247247
// List<AnyRow>.toDataFrame() is heavy!
248248
is ColumnSchema.Group ->
249-
DataColumn.createColumnGroup(
249+
DataColumn.createValueColumn(
250250
name = name,
251-
df = (values as List<AnyRow>).toDataFrame(),
252-
).asDataColumn().cast()
251+
values = values,
252+
infer = if (inferNullability) Infer.Nulls else Infer.None,
253+
type = schema.type,
254+
)
255+
// DataColumn.createColumnGroup(
256+
// name = name,
257+
// df = (values as List<AnyRow>).toDataFrame(),
258+
// ).asDataColumn().cast()
253259

254260
is ColumnSchema.Frame ->
255261
DataColumn.createFrameColumn(

dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,18 @@ import org.duckdb.DuckDBColumnType.UUID
4242
import org.duckdb.DuckDBColumnType.VARCHAR
4343
import org.duckdb.DuckDBResultSetMetaData
4444
import org.duckdb.JsonNode
45+
import org.jetbrains.kotlinx.dataframe.AnyRow
46+
import org.jetbrains.kotlinx.dataframe.DataColumn
4547
import org.jetbrains.kotlinx.dataframe.DataFrame
48+
import org.jetbrains.kotlinx.dataframe.DataRow
49+
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
50+
import org.jetbrains.kotlinx.dataframe.api.asDataColumn
51+
import org.jetbrains.kotlinx.dataframe.api.castToNotNullable
52+
import org.jetbrains.kotlinx.dataframe.api.first
53+
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
54+
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
55+
import org.jetbrains.kotlinx.dataframe.impl.DataCollector
56+
import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl
4657
import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig
4758
import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables
4859
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
@@ -56,6 +67,7 @@ import java.sql.ResultSet
5667
import java.sql.Struct
5768
import java.util.Properties
5869
import kotlin.collections.toList
70+
import kotlin.reflect.KClass
5971
import kotlin.reflect.KTypeProjection
6072
import kotlin.reflect.full.createType
6173
import kotlin.reflect.full.withNullability
@@ -100,7 +112,7 @@ public object DuckDb : DbType("duckdb") {
100112
*/
101113
internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyTypeInformation =
102114
duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) {
103-
when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) {
115+
return@getOrPut when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) {
104116
BOOLEAN -> typeInformationForValueColumnOf<Boolean>(isNullable)
105117

106118
TINYINT -> typeInformationForValueColumnOf<Byte>(isNullable)
@@ -208,9 +220,45 @@ public object DuckDb : DbType("duckdb") {
208220

209221
// TODO requires #1266 for specific types
210222
STRUCT -> {
211-
val structTypes = parseStructType(sqlTypeName)
223+
val structEntries = parseStructType(sqlTypeName)
224+
val parsedStructEntries = structEntries.mapValues { (_, type) ->
225+
parseDuckDbType(sqlTypeName = type, isNullable = true)
226+
}
212227

213-
typeInformationForValueColumnOf<Struct>(isNullable)
228+
val targetSchema = ColumnSchema.Group(
229+
schema = DataFrameSchemaImpl(parsedStructEntries.mapValues { it.value.targetSchema }),
230+
contentType = typeOf<Any?>(),
231+
)
232+
233+
typeInformationWithProcessingFor<Struct, Map<String, Any?>, DataRow<*>>(
234+
jdbcSourceType = typeOf<Struct>().withNullability(isNullable),
235+
targetSchema = targetSchema,
236+
valuePreprocessor = { struct, _ ->
237+
// NOTE: a DataRow cannot be `null` in DataFrame; instead, all of its fields become `null`.
238+
if (struct == null) {
239+
parsedStructEntries.mapValues { null }
240+
} else {
241+
// read data from the struct
242+
val attrs = struct.getAttributes(
243+
parsedStructEntries.mapValues {
244+
(it.value.jdbcSourceType.classifier!! as KClass<*>).java
245+
},
246+
)
247+
248+
// and potentially, preprocess each value individually
249+
parsedStructEntries.entries.withIndex().associate { (i, entry) ->
250+
entry.key to entry.value.castToAny().preprocess(attrs[i])
251+
}
252+
}
253+
},
254+
columnPostprocessor = { col, _ ->
255+
col.castToNotNullable()
256+
.values()
257+
.toDataFrame()
258+
.asColumnGroup(col.name())
259+
.asDataColumn()
260+
},
261+
)
214262
}
215263

216264
// Cannot handle this in Kotlin

dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,10 +255,13 @@ class DuckDbTest {
255255
}
256256
}
257257

258+
@DataSchema
259+
data class NestedEntry(val i: Int, val j: String)
260+
258261
@DataSchema
259262
data class NestedTypes(
260263
@ColumnName("ijstruct_col")
261-
val ijstructCol: java.sql.Struct, // TODO
264+
val ijstructCol: NestedEntry, // TODO
262265
@ColumnName("intarray_col")
263266
val intarrayCol: List<Int?>,
264267
@ColumnName("intlist_col")
@@ -646,7 +649,8 @@ class DuckDbTest {
646649
1 to mapOf("value1" to "a", "value2" to "b"),
647650
200 to mapOf("value1" to "c", "value2" to "d"),
648651
)
649-
it[{ "ijstruct_col"<java.sql.Struct>() }].attributes shouldBe arrayOf<Any>(42, "answer")
652+
it[{ "ijstruct_col"["i"]<Int>() }] shouldBe 42
653+
it[{ "ijstruct_col"["j"]<String>() }] shouldBe "answer"
650654
it["union_col"] shouldBe 2
651655
}
652656
}

0 commit comments

Comments
 (0)