Skip to content

Commit 75d8e78

Browse files
authored
KTNB-693 Send the full dataframe schema as metadata (#706)
KTNB-693 Send full dataframe schema as metadata. * Added a new `types` property to top-level metadata, that contains `kind` and `type` for each column. * Added new `types` and `columns` properties to nested column groups and frames metadata. Similar to the top-level frame. These two changes mean it is possible to access column names and types across the entire data frame hierarchy. --------- Co-authored-by: Christian Melchior <[email protected]>
1 parent 58943ab commit 75d8e78

File tree

6 files changed

+294
-32
lines changed

6 files changed

+294
-32
lines changed

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt

+46-10
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ import org.jetbrains.kotlinx.dataframe.ColumnsContainer
99
import org.jetbrains.kotlinx.dataframe.DataColumn
1010
import org.jetbrains.kotlinx.dataframe.api.indices
1111
import org.jetbrains.kotlinx.dataframe.api.isList
12-
import org.jetbrains.kotlinx.dataframe.api.name
1312
import org.jetbrains.kotlinx.dataframe.api.rows
13+
import org.jetbrains.kotlinx.dataframe.api.schema
1414
import org.jetbrains.kotlinx.dataframe.api.take
1515
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
1616
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
@@ -22,12 +22,16 @@ import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAM
2222
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA
2323
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL
2424
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW
25+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPE
26+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPES
2527
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION
2628
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions
2729
import org.jetbrains.kotlinx.dataframe.io.arrayColumnName
2830
import org.jetbrains.kotlinx.dataframe.io.valueColumnName
31+
import org.jetbrains.kotlinx.dataframe.name
2932
import org.jetbrains.kotlinx.dataframe.ncol
3033
import org.jetbrains.kotlinx.dataframe.nrow
34+
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
3135
import org.jetbrains.kotlinx.dataframe.typeClass
3236
import java.awt.image.BufferedImage
3337
import java.io.IOException
@@ -53,9 +57,13 @@ internal object SerializationKeys {
5357
const val VERSION = "\$version"
5458
const val COLUMNS = "columns"
5559
const val KOTLIN_DATAFRAME = "kotlin_dataframe"
60+
const val TYPE = "type"
61+
const val TYPES = "types"
5662
}
5763

58-
internal const val SERIALIZATION_VERSION = "2.0.0"
64+
// See docs/serialization_format.md for a description of
65+
// serialization versions and format.
66+
internal const val SERIALIZATION_VERSION = "2.1.0"
5967

6068
internal fun KlaxonJson.encodeRowWithMetadata(
6169
frame: ColumnsContainer<*>,
@@ -65,24 +73,39 @@ internal fun KlaxonJson.encodeRowWithMetadata(
6573
): JsonObject? {
6674
val values = frame.columns().map { col ->
6775
when (col) {
68-
is ColumnGroup<*> -> obj(
69-
DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions),
70-
METADATA to obj(KIND to ColumnKind.Group.toString())
71-
)
72-
76+
is ColumnGroup<*> -> {
77+
val schema = col.schema()
78+
obj(
79+
DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions),
80+
METADATA to obj(
81+
KIND to ColumnKind.Group.toString(),
82+
COLUMNS to schema.columns.keys,
83+
TYPES to schema.columns.values.map { columnSchema ->
84+
createJsonTypeDescriptor(columnSchema)
85+
}
86+
),
87+
)
88+
}
7389
is FrameColumn<*> -> {
74-
val data = if (rowLimit == null) encodeFrameWithMetadata(col[index], null, imageEncodingOptions)
75-
else encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions)
90+
val data = if (rowLimit == null) {
91+
encodeFrameWithMetadata(col[index], null, imageEncodingOptions)
92+
} else {
93+
encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions)
94+
}
95+
val schema = col.schema.value
7696
obj(
7797
DATA to data,
7898
METADATA to obj(
7999
KIND to ColumnKind.Frame.toString(),
100+
COLUMNS to schema.columns.keys,
101+
TYPES to schema.columns.values.map { columnSchema ->
102+
createJsonTypeDescriptor(columnSchema)
103+
},
80104
NCOL to col[index].ncol,
81105
NROW to col[index].nrow
82106
)
83107
)
84108
}
85-
86109
else -> encodeValue(col, index, imageEncodingOptions)
87110
}.let { col.name to it }
88111
}
@@ -148,6 +171,16 @@ private fun encodeBufferedImageAsBase64(
148171
}
149172
}
150173

174+
private fun createJsonTypeDescriptor(columnSchema: ColumnSchema): JsonObject {
175+
return JsonObject(
176+
mutableMapOf(KIND to columnSchema.kind.toString()).also {
177+
if (columnSchema.kind == ColumnKind.Value) {
178+
it.put(TYPE, columnSchema.type.toString())
179+
}
180+
}
181+
)
182+
}
183+
151184
internal fun KlaxonJson.encodeFrameWithMetadata(
152185
frame: AnyFrame,
153186
rowLimit: Int? = null,
@@ -257,6 +290,9 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata(
257290
VERSION to SERIALIZATION_VERSION,
258291
METADATA to obj(
259292
COLUMNS to frame.columnNames(),
293+
TYPES to frame.schema().columns.values.map { colSchema ->
294+
createJsonTypeDescriptor(colSchema)
295+
},
260296
NROW to frame.rowsCount(),
261297
NCOL to frame.columnsCount()
262298
),

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt

+8-4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ package org.jetbrains.kotlinx.dataframe.jupyter
22

33
import com.beust.klaxon.json
44
import org.jetbrains.kotlinx.dataframe.api.take
5+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS
6+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME
7+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL
8+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW
59
import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame
610
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions
711
import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData
@@ -69,10 +73,10 @@ internal inline fun <reified T : Any> JupyterHtmlRenderer.render(
6973
!ideBuildNumber.supportsDynamicNestedTables() -> {
7074
json {
7175
obj(
72-
"nrow" to df.size.nrow,
73-
"ncol" to df.size.ncol,
74-
"columns" to df.columnNames(),
75-
"kotlin_dataframe" to encodeFrame(df.take(limit)),
76+
NROW to df.size.nrow,
77+
NCOL to df.size.ncol,
78+
COLUMNS to df.columnNames(),
79+
KOTLIN_DATAFRAME to encodeFrame(df.take(limit)),
7680
)
7781
}.toJsonString()
7882
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt

+46-10
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ import org.jetbrains.kotlinx.dataframe.ColumnsContainer
99
import org.jetbrains.kotlinx.dataframe.DataColumn
1010
import org.jetbrains.kotlinx.dataframe.api.indices
1111
import org.jetbrains.kotlinx.dataframe.api.isList
12-
import org.jetbrains.kotlinx.dataframe.api.name
1312
import org.jetbrains.kotlinx.dataframe.api.rows
13+
import org.jetbrains.kotlinx.dataframe.api.schema
1414
import org.jetbrains.kotlinx.dataframe.api.take
1515
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
1616
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
@@ -22,12 +22,16 @@ import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAM
2222
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA
2323
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL
2424
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW
25+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPE
26+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPES
2527
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION
2628
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions
2729
import org.jetbrains.kotlinx.dataframe.io.arrayColumnName
2830
import org.jetbrains.kotlinx.dataframe.io.valueColumnName
31+
import org.jetbrains.kotlinx.dataframe.name
2932
import org.jetbrains.kotlinx.dataframe.ncol
3033
import org.jetbrains.kotlinx.dataframe.nrow
34+
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
3135
import org.jetbrains.kotlinx.dataframe.typeClass
3236
import java.awt.image.BufferedImage
3337
import java.io.IOException
@@ -53,9 +57,13 @@ internal object SerializationKeys {
5357
const val VERSION = "\$version"
5458
const val COLUMNS = "columns"
5559
const val KOTLIN_DATAFRAME = "kotlin_dataframe"
60+
const val TYPE = "type"
61+
const val TYPES = "types"
5662
}
5763

58-
internal const val SERIALIZATION_VERSION = "2.0.0"
64+
// See docs/serialization_format.md for a description of
65+
// serialization versions and format.
66+
internal const val SERIALIZATION_VERSION = "2.1.0"
5967

6068
internal fun KlaxonJson.encodeRowWithMetadata(
6169
frame: ColumnsContainer<*>,
@@ -65,24 +73,39 @@ internal fun KlaxonJson.encodeRowWithMetadata(
6573
): JsonObject? {
6674
val values = frame.columns().map { col ->
6775
when (col) {
68-
is ColumnGroup<*> -> obj(
69-
DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions),
70-
METADATA to obj(KIND to ColumnKind.Group.toString())
71-
)
72-
76+
is ColumnGroup<*> -> {
77+
val schema = col.schema()
78+
obj(
79+
DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions),
80+
METADATA to obj(
81+
KIND to ColumnKind.Group.toString(),
82+
COLUMNS to schema.columns.keys,
83+
TYPES to schema.columns.values.map { columnSchema ->
84+
createJsonTypeDescriptor(columnSchema)
85+
}
86+
),
87+
)
88+
}
7389
is FrameColumn<*> -> {
74-
val data = if (rowLimit == null) encodeFrameWithMetadata(col[index], null, imageEncodingOptions)
75-
else encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions)
90+
val data = if (rowLimit == null) {
91+
encodeFrameWithMetadata(col[index], null, imageEncodingOptions)
92+
} else {
93+
encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions)
94+
}
95+
val schema = col.schema.value
7696
obj(
7797
DATA to data,
7898
METADATA to obj(
7999
KIND to ColumnKind.Frame.toString(),
100+
COLUMNS to schema.columns.keys,
101+
TYPES to schema.columns.values.map { columnSchema ->
102+
createJsonTypeDescriptor(columnSchema)
103+
},
80104
NCOL to col[index].ncol,
81105
NROW to col[index].nrow
82106
)
83107
)
84108
}
85-
86109
else -> encodeValue(col, index, imageEncodingOptions)
87110
}.let { col.name to it }
88111
}
@@ -148,6 +171,16 @@ private fun encodeBufferedImageAsBase64(
148171
}
149172
}
150173

174+
private fun createJsonTypeDescriptor(columnSchema: ColumnSchema): JsonObject {
175+
return JsonObject(
176+
mutableMapOf(KIND to columnSchema.kind.toString()).also {
177+
if (columnSchema.kind == ColumnKind.Value) {
178+
it.put(TYPE, columnSchema.type.toString())
179+
}
180+
}
181+
)
182+
}
183+
151184
internal fun KlaxonJson.encodeFrameWithMetadata(
152185
frame: AnyFrame,
153186
rowLimit: Int? = null,
@@ -257,6 +290,9 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata(
257290
VERSION to SERIALIZATION_VERSION,
258291
METADATA to obj(
259292
COLUMNS to frame.columnNames(),
293+
TYPES to frame.schema().columns.values.map { colSchema ->
294+
createJsonTypeDescriptor(colSchema)
295+
},
260296
NROW to frame.rowsCount(),
261297
NCOL to frame.columnsCount()
262298
),

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt

+9-4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ package org.jetbrains.kotlinx.dataframe.jupyter
22

33
import com.beust.klaxon.json
44
import org.jetbrains.kotlinx.dataframe.api.take
5+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS
6+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME
7+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL
8+
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW
59
import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame
610
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions
711
import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData
@@ -65,14 +69,15 @@ internal inline fun <reified T : Any> JupyterHtmlRenderer.render(
6569
if (notebook.kernelVersion >= KotlinKernelVersion.from(MIN_KERNEL_VERSION_FOR_NEW_TABLES_UI)!!) {
6670
val ideBuildNumber = KotlinNotebookPluginUtils.getKotlinNotebookIDEBuildNumber()
6771

72+
// TODO Do we need to handle the improved metadata here as well?
6873
val jsonEncodedDf = when {
6974
!ideBuildNumber.supportsDynamicNestedTables() -> {
7075
json {
7176
obj(
72-
"nrow" to df.size.nrow,
73-
"ncol" to df.size.ncol,
74-
"columns" to df.columnNames(),
75-
"kotlin_dataframe" to encodeFrame(df.take(limit)),
77+
NROW to df.size.nrow,
78+
NCOL to df.size.ncol,
79+
COLUMNS to df.columnNames(),
80+
KOTLIN_DATAFRAME to encodeFrame(df.take(limit)),
7681
)
7782
}.toJsonString()
7883
}

0 commit comments

Comments
 (0)