Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateByOrNull
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateFor
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateOf
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateOfRow
import org.jetbrains.kotlinx.dataframe.impl.columns.ValueColumnInternal
import org.jetbrains.kotlinx.dataframe.impl.columns.toComparableColumns
import org.jetbrains.kotlinx.dataframe.impl.suggestIfNull
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
Expand All @@ -33,7 +34,11 @@ public fun <T : Comparable<T>> DataColumn<T?>.max(skipNaN: Boolean = skipNaNDefa
maxOrNull(skipNaN).suggestIfNull("max")

public fun <T : Comparable<T>> DataColumn<T?>.maxOrNull(skipNaN: Boolean = skipNaNDefault): T? =
Aggregators.max<T>(skipNaN).aggregateSingleColumn(this)
if (this is ValueColumnInternal<*>) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not so fond of this solution, as it requires a lot of refactoring in other functions, plus it does not work when you write df.max { myCol }, as I mentioned in #1492 (comment)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead. I'd do this check inside the original aggregateSingleColumn(). Each Aggregator has a name which you could use to query the ValueColumnInternal for the right WrappedStatistic if they are stored in a Map<String, WrappedStatistic>inValueColumnImpl. Though I suppose each Aggregatorwill also need to store any other provided arguments likeskipNaN: Booleanandpercentile: Doublewhen needed... In aMap<String, Any?>` maybe?

That way we could store our "Statistics Cache" in ValueColumnImpl as a

Map<String, Map<Map<String, Any?>, Any?>>

so the result cache could look like:

{
   "max" : {
        { "skipNaN": true } : 312.4
    },
   "min" : {},
    "std" : {
        { "std": 0.9, "skipNaN": false } : Double.NaN,
        { "std": 0.9, "skipNaN": true } : 12.3
    }
}

The challenge may lie in doing this neatly ;P

Copy link
Contributor Author

@CarloMariaProietti CarloMariaProietti Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for reviewing!

Imo using Map<String, Map<Map<String, Any?>, Any?>> introduces a problem,
making a query to this structure does not allow to know if the statistic was computed 'in the past'.
Computing the stat using aggregateSequence implies that the stat can be null, so making a query and getting null does not tell me whether the stat was computed yet.

Maybe it could be Map<String, Map<Map<String, Any?>, WrappedStatistic>>
where WrappedStatistic has two fileds : wasComputed: Boolean and actualStatistic: Any? ?

Aggregators.max<T>(skipNaN).aggregateSingleColumn(this, this.max, skipNaN)
} else {
Aggregators.max<T>(skipNaN).aggregateSingleColumn(this)
}

public inline fun <T, reified R : Comparable<R & Any>?> DataColumn<T>.maxBy(
skipNaN: Boolean = skipNaNDefault,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.asSequence
import org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators.aggregationHandlers.SelectingAggregationHandler
import org.jetbrains.kotlinx.dataframe.impl.columns.WrappedStatistic
import kotlin.reflect.KType

/**
Expand Down Expand Up @@ -34,6 +35,42 @@ public interface AggregatorAggregationHandler<in Value : Any, out Return : Any?>
valueType = column.type().toValueType(),
)

/**
* optimized override of [aggregateSingleColumn],
* preferred when column's runtime type is ValueColumnInternal so that
* it is possible to exploit cached statistics which are proper of ValueColumnInternal
*/
public fun aggregateSingleColumn(
column: DataColumn<Value?>,
wrappedStatistic: WrappedStatistic,
skipNaN: Boolean,
): Return {
when {
skipNaN && wrappedStatistic.wasComputedSkippingNaN -> {
return wrappedStatistic.statisticComputedSkippingNaN as Return
}

(!skipNaN) && wrappedStatistic.wasComputedNotSkippingNaN -> {
return wrappedStatistic.statisticComputedNotSkippingNaN as Return
}

else -> {
val statistic = aggregateSequence(
values = column.asSequence(),
valueType = column.type().toValueType(),
)
if (skipNaN) {
wrappedStatistic.wasComputedSkippingNaN = true
wrappedStatistic.statisticComputedSkippingNaN = statistic
} else {
wrappedStatistic.wasComputedNotSkippingNaN = true
wrappedStatistic.statisticComputedNotSkippingNaN = statistic
}
return aggregateSingleColumn(column, wrappedStatistic, skipNaN)
}
}
}

/**
* Function that can give the return type of [aggregateSequence] as [KType], given the type of the input.
* This allows aggregators to avoid runtime type calculations.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,26 @@ import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
import kotlin.reflect.KType
import kotlin.reflect.full.withNullability

public class WrappedStatistic(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this class should not be public, should it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this class should not be public, should it?

Also, I think, if you make the other a var, this can be a data class with var's. It's a bit more kotlin-like :)

public var wasComputedSkippingNaN: Boolean = false,
public var wasComputedNotSkippingNaN: Boolean = false,
public var statisticComputedSkippingNaN: Any? = null,
public var statisticComputedNotSkippingNaN: Any? = null,
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about std and percentile that take extra arguments?


internal interface ValueColumnInternal<T> : ValueColumn<T> {
val max: WrappedStatistic
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make this a var and nullable, so we can initialize it to null and don't need to instantiate a class for each statistic when a column is created.

}

internal open class ValueColumnImpl<T>(
values: List<T>,
name: String,
type: KType,
val defaultValue: T? = null,
distinct: Lazy<Set<T>>? = null,
) : DataColumnImpl<T>(values, name, type, distinct),
ValueColumn<T> {
ValueColumn<T>,
ValueColumnInternal<T> {

override fun distinct() = ValueColumnImpl(toSet().toList(), name, type, defaultValue, distinct)

Expand Down Expand Up @@ -48,10 +60,13 @@ internal open class ValueColumnImpl<T>(
override fun defaultValue() = defaultValue

override fun forceResolve() = ResolvingValueColumn(this)

override val max = WrappedStatistic()
}

internal class ResolvingValueColumn<T>(override val source: ValueColumn<T>) :
ValueColumn<T> by source,
ValueColumnInternal<T>,
ForceResolvedColumn<T> {

override fun resolve(context: ColumnResolutionContext) = super<ValueColumn>.resolve(context)
Expand All @@ -70,4 +85,6 @@ internal class ResolvingValueColumn<T>(override val source: ValueColumn<T>) :
override fun equals(other: Any?) = source.checkEquals(other)

override fun hashCode(): Int = source.hashCode()

override val max = WrappedStatistic()
}
Empty file added git
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove this from the commit

Empty file.
Loading