Skip to content

Commit a22a2df

Browse files
committed
introduces FastDoubleParser and the useFastDoubleParser parameter in ParserOptions, plus tests
1 parent 4fc5f34 commit a22a2df

File tree

9 files changed

+505
-78
lines changed

9 files changed

+505
-78
lines changed

core/build.gradle.kts

+1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ dependencies {
7171
implementation(libs.commonsIo)
7272
implementation(libs.serialization.core)
7373
implementation(libs.serialization.json)
74+
implementation(libs.fastDoubleParser)
7475

7576
implementation(libs.fuel)
7677

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

+21-10
Original file line numberDiff line numberDiff line change
@@ -185,21 +185,32 @@ public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = conve
185185
public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()
186186

187187
/**
188-
* Parse String column to Double considering locale (number format).
188+
* Parses a String column to Double considering locale (number format).
189189
* If [locale] parameter is defined, its number format is used for parsing.
190-
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
190+
* If [locale] parameter is null, the current system locale is used.
191+
* If the column cannot be parsed, then the POSIX format is used.
192+
*
193+
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
191194
*/
192195
@JvmName("convertToDoubleFromString")
193-
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> =
194-
this.castToNullable().convertToDouble(locale).castToNotNullable()
196+
public fun DataColumn<String>.convertToDouble(
197+
locale: Locale? = null,
198+
useFastDoubleParser: Boolean = false,
199+
): DataColumn<Double> = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable()
195200

196201
/**
197-
* Parse String column to Double considering locale (number format).
202+
* Parses a String column to Double considering locale (number format).
198203
* If [locale] parameter is defined, its number format is used for parsing.
199-
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
204+
* If [locale] parameter is null, the current system locale is used.
205+
* If the column cannot be parsed, then the POSIX format is used.
206+
*
207+
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
200208
*/
201209
@JvmName("convertToDoubleFromStringNullable")
202-
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> {
210+
public fun DataColumn<String?>.convertToDouble(
211+
locale: Locale? = null,
212+
useFastDoubleParser: Boolean = false,
213+
): DataColumn<Double?> {
203214
fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
204215
var currentRow = 0
205216
try {
@@ -220,14 +231,14 @@ public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColu
220231
}
221232

222233
return if (locale != null) {
223-
val explicitParser = Parsers.getDoubleParser(locale)
234+
val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser)
224235
applyParser(explicitParser)
225236
} else {
226237
try {
227-
val defaultParser = Parsers.getDoubleParser()
238+
val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser)
228239
applyParser(defaultParser)
229240
} catch (e: TypeConversionException) {
230-
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"))
241+
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser)
231242
applyParser(posixParser)
232243
}
233244
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

+17
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,29 @@ public interface GlobalParserOptions {
4040
public var locale: Locale
4141
}
4242

43+
/**
44+
* ### Options for parsing [String]`?` columns
45+
*
46+
* @param locale locale to use for parsing dates and numbers, defaults to the System default locale.
47+
* If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern]
48+
* to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse
49+
* locale-specific dates!
50+
* @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates. If not specified, it will be created
51+
* from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] is specified,
52+
* [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used.
53+
* @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter],
54+
* it will be used to create a [DateTimeFormatter].
55+
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's
56+
* ["null", "NULL", "NA", "N/A"].
57+
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
58+
*/
4359
public data class ParserOptions(
4460
val locale: Locale? = null,
4561
// TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876
4662
val dateTimeFormatter: DateTimeFormatter? = null,
4763
val dateTimePattern: String? = null,
4864
val nullStrings: Set<String>? = null,
65+
val useFastDoubleParser: Boolean = false,
4966
) {
5067
internal fun getDateTimeFormatter(): DateTimeFormatter? =
5168
when {

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt

+7
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,31 @@ import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER
1919
* {@include [Indent]}
2020
*
2121
*/
22+
@ExcludeFromSources
2223
internal interface LineBreak
2324

2425
/** &nbsp; */
26+
@ExcludeFromSources
2527
internal interface QuarterIndent
2628

2729
/** &nbsp;&nbsp; */
30+
@ExcludeFromSources
2831
internal interface HalfIndent
2932

3033
/** &nbsp;&nbsp;&nbsp;&nbsp; */
34+
@ExcludeFromSources
3135
internal interface Indent
3236

3337
/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
38+
@ExcludeFromSources
3439
internal interface DoubleIndent
3540

3641
/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
42+
@ExcludeFromSources
3743
internal interface TripleIndent
3844

3945
/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
46+
@ExcludeFromSources
4047
internal interface QuadrupleIndent
4148

4249
/**

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

+10-29
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,13 @@ import org.jetbrains.kotlinx.dataframe.hasNulls
3333
import org.jetbrains.kotlinx.dataframe.impl.canParse
3434
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
3535
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
36+
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
3637
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse
3738
import org.jetbrains.kotlinx.dataframe.io.isURL
3839
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
3940
import org.jetbrains.kotlinx.dataframe.values
4041
import java.math.BigDecimal
4142
import java.net.URL
42-
import java.text.NumberFormat
4343
import java.text.ParsePosition
4444
import java.time.format.DateTimeFormatter
4545
import java.time.format.DateTimeFormatterBuilder
@@ -274,29 +274,6 @@ internal object Parsers : GlobalParserOptions {
274274
null
275275
}
276276

277-
private fun String.parseDouble(format: NumberFormat) =
278-
when (uppercase(Locale.getDefault())) {
279-
"NAN" -> Double.NaN
280-
281-
"INF" -> Double.POSITIVE_INFINITY
282-
283-
"-INF" -> Double.NEGATIVE_INFINITY
284-
285-
"INFINITY" -> Double.POSITIVE_INFINITY
286-
287-
"-INFINITY" -> Double.NEGATIVE_INFINITY
288-
289-
else -> {
290-
val parsePosition = ParsePosition(0)
291-
val result: Double? = format.parse(this, parsePosition)?.toDouble()
292-
if (parsePosition.index != this.length) {
293-
null
294-
} else {
295-
result
296-
}
297-
}
298-
}
299-
300277
inline fun <reified T : Any> stringParser(
301278
catch: Boolean = false,
302279
coveredBy: Set<KType> = emptySet(),
@@ -316,11 +293,15 @@ internal object Parsers : GlobalParserOptions {
316293
): StringParserWithFormat<T> = StringParserWithFormat(typeOf<T>(), coveredBy, body)
317294

318295
private val parserToDoubleWithOptions = stringParserWithOptions { options ->
319-
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault())
320-
val parser = { it: String -> it.parseDouble(numberFormat) }
296+
val fastDoubleParser = FastDoubleParser(options ?: ParserOptions())
297+
val parser = { it: String -> fastDoubleParser.parseOrNull(it) }
321298
parser
322299
}
323300

301+
private val posixDoubleParser = FastDoubleParser(
302+
ParserOptions(locale = Locale.forLanguageTag("C.UTF-8")),
303+
)
304+
324305
internal val parsersOrder = listOf(
325306
// Int
326307
stringParser<Int> { it.toIntOrNull() },
@@ -383,7 +364,7 @@ internal object Parsers : GlobalParserOptions {
383364
// Double, with explicit number format or taken from current locale
384365
parserToDoubleWithOptions,
385366
// Double, with POSIX format
386-
stringParser<Double> { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) },
367+
stringParser<Double> { posixDoubleParser.parseOrNull(it) },
387368
// Boolean
388369
stringParser<Boolean> { it.toBooleanOrNull() },
389370
// BigDecimal
@@ -448,9 +429,9 @@ internal object Parsers : GlobalParserOptions {
448429
return parser.applyOptions(options)
449430
}
450431

451-
internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? {
432+
internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? {
452433
val options = if (locale != null) {
453-
ParserOptions(locale = locale)
434+
ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser)
454435
} else {
455436
null
456437
}

0 commit comments

Comments
 (0)