Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 61 additions & 43 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/distinct.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,23 @@ package org.jetbrains.kotlinx.dataframe.api
import org.jetbrains.kotlinx.dataframe.AnyColumnReference
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
import org.jetbrains.kotlinx.dataframe.annotations.Refine
import org.jetbrains.kotlinx.dataframe.api.DistinctDocs.DESCRIPTION
import org.jetbrains.kotlinx.dataframe.api.DistinctDocs.DISTINCT_PARAM
import org.jetbrains.kotlinx.dataframe.api.DistinctDocs.DISTINCT_RETURN
import org.jetbrains.kotlinx.dataframe.api.DistinctDocs.FUNCTION
import org.jetbrains.kotlinx.dataframe.api.DistinctDocs.PHRASE_ENDING
import org.jetbrains.kotlinx.dataframe.api.Select.SelectSelectingOptions
import org.jetbrains.kotlinx.dataframe.columns.ColumnSet
import org.jetbrains.kotlinx.dataframe.columns.SingleColumn
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
import org.jetbrains.kotlinx.dataframe.documentation.Indent
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
import org.jetbrains.kotlinx.dataframe.impl.columns.DistinctColumnSet
import org.jetbrains.kotlinx.dataframe.indices
Expand All @@ -23,45 +29,60 @@ import kotlin.reflect.KProperty
// region DataFrame

/**
* ## The Distinct Operation
* {@get [DESCRIPTION] Removes duplicated rows based on {@get [PHRASE_ENDING]}}.
Copy link
Collaborator

@Jolanrensen Jolanrensen Dec 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, I've never used a {@get} inside the "default" section of another {@get} XD, but it works :). Did you know you can also write $[PHRASE_ENDING], by the way? It's just an alternative notation that does the same thing.

*
* It removes duplicated rows based on {@get PHRASE_ENDING}.
* The [rows][DataRow] in the resulting [DataFrame] are in the same order
* as they were in the original [DataFrame].
*
* __NOTE:__ The rows in the resulting [DataFrame] are in the same order as they were in the original [DataFrame].
* See also {@get [FUNCTION] [distinctBy] that removes duplicated rows based on the specified columns
* and keeps all the columns in the resulting [DataFrame].}
*
* {@get [DISTINCT_PARAM] @param [columns]
* The names of the columns to consider for evaluating distinct rows.}
* @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention]
*
* See [Selecting Columns][SelectSelectingOptions].
*
* For more information:
*
* @return A new DataFrame containing only distinct rows.
* {@include [DocumentationUrls.Distinct]}
*
* @see [Selecting Columns][SelectSelectingOptions].
* @see {@include [DocumentationUrls.Distinct]}
* {@include [DocumentationUrls.DistinctBy]}
*
* {@get [DISTINCT_PARAM]}
*
* @return {@get [DISTINCT_RETURN] A new [DataFrame] containing only distinct rows.}
*/
@ExcludeFromSources
@Suppress("ClassName")
private interface DistinctDocs {
// Slot for the `@param` entry of the shared KDoc; an overload without parameters sets it to nothing.
interface DISTINCT_PARAM

// Slot for the text of the `@return` tag; defaults to "A new [DataFrame] containing only distinct rows."
interface DISTINCT_RETURN

// Slot for the opening sentence; defaults to "Removes duplicated rows based on <PHRASE_ENDING>".
interface DESCRIPTION

// Slot for what rows are compared by, e.g. "all columns" or "the specified columns".
interface PHRASE_ENDING

// Slot for the "See also" sentence that points at the sibling function (`distinct` or `distinctBy`).
interface FUNCTION
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I usually write a tiny comment (no KDoc) next to each parameter, to clarify what they mean

}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING all columns}.
* {@set [DistinctDocs.DISTINCT_PARAM]}
* {@set [PHRASE_ENDING] all columns}
* {@set [DISTINCT_PARAM]}
*/
public fun <T> DataFrame<T>.distinct(): DataFrame<T> {
    // Comparing by every column: delegate to distinctBy with the all() selector.
    return distinctBy { all() }
}

/**
* {@include [DistinctDocs]}
Copy link
Collaborator

@Jolanrensen Jolanrensen Dec 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None of the tags in this KDoc need {} around them, as they are not inside another line. It doesn't hurt though, so feel free to write it how you think is best

* {@set PHRASE_ENDING the specified columns}.
* {@set [DESCRIPTION] Selects the specified columns and keeps only distinct rows based on these selected columns}
* {@set [DISTINCT_PARAM] @param [columns\] The [ColumnsSelector] used to select columns
* that will be included in the resulting [DataFrame] and considered for evaluating distinct rows.}
* {@set [DISTINCT_RETURN] A new [DataFrame] containing only selected columns and distinct rows.}
*/
@Refine
@Interpretable("Distinct0")
public fun <T, C> DataFrame<T>.distinct(columns: ColumnsSelector<T, C>): DataFrame<T> {
    // First narrow the frame to the selected columns, then drop duplicated rows of that projection.
    val selected = select(columns)
    return selected.distinct()
}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
*/
@Deprecated(DEPRECATED_ACCESS_API)
@AccessApiOverload
public fun <T> DataFrame<T>.distinct(vararg columns: KProperty<*>): DataFrame<T> =
Expand All @@ -72,46 +93,47 @@ public fun <T> DataFrame<T>.distinct(vararg columns: KProperty<*>): DataFrame<T>

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
* {@set [DESCRIPTION] Selects the specified columns and keeps only distinct rows based on these selected columns}
* {@set [DISTINCT_PARAM] @param [columns\] The names of the columns to select
* and to consider for evaluating distinct rows.}
* {@set [DISTINCT_RETURN] A new [DataFrame] containing only selected columns and distinct rows.}
*/
public fun <T> DataFrame<T>.distinct(vararg columns: String): DataFrame<T> {
    // Turn the column names into a column set and reuse the selector-based overload.
    return distinct { columns.toColumnSet() }
}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
*/
@Deprecated(DEPRECATED_ACCESS_API)
@AccessApiOverload
public fun <T> DataFrame<T>.distinct(vararg columns: AnyColumnReference): DataFrame<T> {
    // Turn the column references into a column set and reuse the selector-based overload.
    return distinct { columns.toColumnSet() }
}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
*/
@Deprecated(DEPRECATED_ACCESS_API)
@AccessApiOverload
public fun <T> DataFrame<T>.distinctBy(vararg columns: KProperty<*>): DataFrame<T> {
    // Turn the properties into a column set and reuse the selector-based overload.
    return distinctBy { columns.toColumnSet() }
}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
* {@set [PHRASE_ENDING] the specified columns}
* {@set [FUNCTION] [distinct] that selects the specified columns
* (if the columns are not specified, selects all columns)
* and keeps only distinct rows based on these selected columns.}
* {@set [DISTINCT_PARAM] @param [columns\]
* The names of the columns to consider for evaluating distinct rows.}
*/
public fun <T> DataFrame<T>.distinctBy(vararg columns: String): DataFrame<T> {
    // Turn the column names into a column set and reuse the selector-based overload.
    return distinctBy { columns.toColumnSet() }
}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
*/
@Deprecated(DEPRECATED_ACCESS_API)
@AccessApiOverload
public fun <T> DataFrame<T>.distinctBy(vararg columns: AnyColumnReference): DataFrame<T> {
    // Turn the column references into a column set and reuse the selector-based overload.
    return distinctBy { columns.toColumnSet() }
}

/**
* {@include [DistinctDocs]}
* {@set PHRASE_ENDING the specified columns}.
* {@set [PHRASE_ENDING] the specified columns}
* {@set [FUNCTION] [distinct] that selects the specified columns
* (if the columns are not specified, selects all columns)
* and keeps only distinct rows based on these selected columns.}
* {@set [DISTINCT_PARAM] @param [columns\] The [ColumnsSelector] used to select columns
* that will be considered for evaluating distinct rows.}
*/
public fun <T, C> DataFrame<T>.distinctBy(columns: ColumnsSelector<T, C>): DataFrame<T> {
val cols = get(columns)
Expand All @@ -124,15 +146,13 @@ public fun <T, C> DataFrame<T>.distinctBy(columns: ColumnsSelector<T, C>): DataF
// region ColumnsSelectionDsl

/**
* ##### Distinct {@include [ColumnsSelectionDslLink]}
* Distinct {@include [ColumnsSelectionDslLink]}.
*
* See [Grammar] for all functions in this interface.
*/
public interface DistinctColumnsSelectionDsl {

/**
* ## Distinct Grammar
*
* @include [DslGrammarTemplate]
* {@set [DslGrammarTemplate.DEFINITIONS]
* {@include [DslGrammarTemplate.ColumnSetDef]}
Expand All @@ -152,23 +172,21 @@ public interface DistinctColumnsSelectionDsl {
}

/**
* ## Distinct
* Returns a new [ColumnSet] from [this] [ColumnSet] containing only distinct columns (by path).
* This is useful when you've selected the same column multiple times but only want it once.
*
* NOTE: This doesn't solve [DuplicateColumnNamesException] if you've selected two columns with the same name.
* This doesn't solve [DuplicateColumnNamesException] if you've selected two columns with the same name.
* For this, you'll need to [rename][ColumnsSelectionDsl.named] one of the columns.
*
* ### Check out: [Grammar]
*
* #### For Example:
* `df.`[select][DataFrame.select]` { (`[colsOf][SingleColumn.colsOf]`<`[Int][Int]`>() `[and][ColumnsSelectionDsl.and]` age).`[distinct][ColumnSet.distinct]`() }`
* See also [Grammar], [named][ColumnsSelectionDsl.named], [simplify][ColumnsSelectionDsl.simplify].
*
* `df.`[select][DataFrame.select]` { `[colsAtAnyDepth][ColumnsSelectionDsl.colsAtAnyDepth]`().`[nameStartsWith][ColumnsSelectionDsl.nameStartsWith]`("order").`[distinct][ColumnSet.distinct]`() }`
* ### Examples
* ```kotlin
* df.select { (colsOf<Int>() and age).distinct() }
* df.select { colsAtAnyDepth().nameStartsWith("order").distinct() }
* ```
*
* @return A new [ColumnSet] containing only distinct columns (by path).
* @see ColumnsSelectionDsl.named
* @see ColumnsSelectionDsl.simplify
*/
public fun <C> ColumnSet<C>.distinct(): ColumnSet<C> {
    // Wrap the receiver in a DistinctColumnSet.
    return DistinctColumnSet(this)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,12 @@ internal interface DocumentationUrls {
/** [See `remove` on the documentation website.]({@include [Url]}/remove.html) */
interface Remove

/** <a href="{@include [Url]}/distinct.html">See `distinct` on the documentation website.</a> */
/** [See `distinct` on the documentation website.]({@include [Url]}/distinct.html) */
interface Distinct

/** [See `distinctBy` on the documentation website.]({@include [Url]}/distinct.html#distinctby) */
interface DistinctBy

/** <a href="{@include [Url]}/flatten.html">See `flatten` on the documentation website.</a> */
interface Flatten

Expand Down