Skip to content

Add DataFrame fill_nan/fill_null #1019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
36 changes: 36 additions & 0 deletions docs/source/user-guide/common-operations/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,39 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f
.limit(20)
.to_pandas()
)


Handling Missing Values
=====================

DataFusion provides methods to handle missing values in DataFrames:

fill_null
---------

The ``fill_null()`` method replaces NULL values in specified columns with a provided value:

.. code-block:: python

# Fill all NULL values with 0 where possible
df = df.fill_null(0)

# Fill NULL values only in specific string columns
df = df.fill_null("missing", subset=["name", "category"])

The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged.

fill_nan
--------

The ``fill_nan()`` method replaces NaN values in floating-point columns with a provided numeric value:

.. code-block:: python

# Fill all NaN values with 0 in numeric columns
df = df.fill_nan(0)

# Fill NaN values in specific numeric columns
df = df.fill_nan(99.9, subset=["price", "score"])

This only works on floating-point columns (float32, float64). The fill value must be numeric (int or float).
27 changes: 27 additions & 0 deletions python/datafusion/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from __future__ import annotations

import warnings
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -37,6 +38,9 @@
except ImportError:
from typing_extensions import deprecated # Python 3.12

from datafusion import functions as f

Check failure on line 41 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

python/datafusion/dataframe.py:41:37: F401 `datafusion.functions` imported but unused
from datafusion._internal import DataFrame as DataFrameInternal
from datafusion.expr import Expr, SortExpr, sort_or_default
from datafusion.plan import ExecutionPlan, LogicalPlan
from datafusion.record_batch import RecordBatchStream

Expand All @@ -51,9 +55,9 @@
from datafusion._internal import DataFrame as DataFrameInternal
from datafusion._internal import expr as expr_internal

from enum import Enum

Check failure on line 58 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F811)

python/datafusion/dataframe.py:58:18: F811 Redefinition of unused `Enum` from line 25

from datafusion.expr import Expr, SortExpr, sort_or_default

Check failure on line 60 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F811)

python/datafusion/dataframe.py:60:29: F811 Redefinition of unused `Expr` from line 43

Check failure on line 60 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F811)

python/datafusion/dataframe.py:60:35: F811 Redefinition of unused `SortExpr` from line 43

Check failure on line 60 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F811)

python/datafusion/dataframe.py:60:45: F811 Redefinition of unused `sort_or_default` from line 43


# excerpt from deltalake
Expand Down Expand Up @@ -869,3 +873,26 @@
DataFrame: After applying func to the original dataframe.
"""
return func(self, *args)

def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":

Check failure on line 877 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (UP037)

python/datafusion/dataframe.py:877:73: UP037 Remove quotes from type annotation
"""Fill null values in specified columns with a value.

Args:
value: Value to replace nulls with. Will be cast to match column type.
subset: Optional list of column names to fill. If None, fills all columns.

Returns:
DataFrame with null values replaced where type casting is possible

Examples:
>>> df = df.fill_null(0) # Fill all nulls with 0 where possible
>>> # Fill nulls in specific string columns
>>> df = df.fill_null("missing", subset=["name", "category"])

Notes:
- Only fills nulls in columns where the value can be cast to the column type
- For columns where casting fails, the original column is kept unchanged
- For columns not in subset, the original column is kept unchanged
"""

Check failure on line 896 in python/datafusion/dataframe.py

View workflow job for this annotation

GitHub Actions / build

Ruff (D202)

python/datafusion/dataframe.py:878:9: D202 No blank lines allowed after function docstring (found 1)

return DataFrame(self.df.fill_null(value, subset))
54 changes: 54 additions & 0 deletions python/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,3 +1231,57 @@

actual = df.collect()[0].to_pydict()
assert actual == expected


def test_coalesce(df):
# Create a DataFrame with null values
ctx = SessionContext()
batch = pa.RecordBatch.from_arrays(
[
pa.array(["Hello", None, "!"]), # string column with null
pa.array([4, None, 6]), # integer column with null
pa.array(["hello ", None, " !"]), # string column with null
pa.array(
[datetime(2022, 12, 31), None, datetime(2020, 7, 2)]

Check failure on line 1245 in python/tests/test_functions.py

View workflow job for this annotation

GitHub Actions / build

Ruff (DTZ001)

python/tests/test_functions.py:1245:18: DTZ001 `datetime.datetime()` called without a `tzinfo` argument

Check failure on line 1245 in python/tests/test_functions.py

View workflow job for this annotation

GitHub Actions / build

Ruff (DTZ001)

python/tests/test_functions.py:1245:48: DTZ001 `datetime.datetime()` called without a `tzinfo` argument
), # datetime with null
pa.array([False, None, True]), # boolean column with null
],
names=["a", "b", "c", "d", "e"],
)
df_with_nulls = ctx.create_dataframe([[batch]])

# Test coalesce with different data types
result_df = df_with_nulls.select(
f.coalesce(column("a"), literal("default")).alias("a_coalesced"),
f.coalesce(column("b"), literal(0)).alias("b_coalesced"),
f.coalesce(column("c"), literal("default")).alias("c_coalesced"),
f.coalesce(column("d"), literal(datetime(2000, 1, 1))).alias("d_coalesced"),

Check failure on line 1258 in python/tests/test_functions.py

View workflow job for this annotation

GitHub Actions / build

Ruff (DTZ001)

python/tests/test_functions.py:1258:41: DTZ001 `datetime.datetime()` called without a `tzinfo` argument
f.coalesce(column("e"), literal(False)).alias("e_coalesced"),
)

result = result_df.collect()[0]

# Verify results
assert result.column(0) == pa.array(
["Hello", "default", "!"], type=pa.string_view()
)
assert result.column(1) == pa.array([4, 0, 6], type=pa.int64())
assert result.column(2) == pa.array(
["hello ", "default", " !"], type=pa.string_view()
)
assert result.column(3) == pa.array(
[datetime(2022, 12, 31), datetime(2000, 1, 1), datetime(2020, 7, 2)],
type=pa.timestamp("us"),
)
assert result.column(4) == pa.array([False, False, True], type=pa.bool_())

# Test multiple arguments
result_df = df_with_nulls.select(
f.coalesce(column("a"), literal(None), literal("fallback")).alias(
"multi_coalesce"
)
Comment on lines +1236 to +1282
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could not find tests for coalesce which I used for fill_null, so I added these

)
result = result_df.collect()[0]
assert result.column(0) == pa.array(
["Hello", "fallback", "!"], type=pa.string_view()
)
63 changes: 63 additions & 0 deletions src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,25 @@ impl PyDataFrame {
fn count(&self, py: Python) -> PyDataFusionResult<usize> {
Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
}

/// Fill null values with a specified value for specific columns
#[pyo3(signature = (value, columns=None))]
fn fill_null(
&self,
value: PyObject,
columns: Option<Vec<PyBackedStr>>,
py: Python,
) -> PyDataFusionResult<Self> {
let scalar_value = python_value_to_scalar_value(&value, py)?;

let cols = match columns {
Some(col_names) => col_names.iter().map(|c| c.to_string()).collect(),
None => Vec::new(), // Empty vector means fill null for all columns
};

let df = self.df.as_ref().clone().fill_null(scalar_value, cols)?;
Ok(Self::new(df))
}
}

/// Print DataFrame
Expand Down Expand Up @@ -951,3 +970,47 @@ async fn collect_record_batches_to_display(

Ok((record_batches, has_more))
}

/// Convert a Python value to a DataFusion ScalarValue
fn python_value_to_scalar_value(value: &PyObject, py: Python) -> PyDataFusionResult<ScalarValue> {
if value.is_none(py) {
return Err(PyDataFusionError::Common(
"Cannot use None as fill value".to_string(),
));
} else if let Ok(val) = value.extract::<i64>(py) {
return Ok(ScalarValue::Int64(Some(val)));
} else if let Ok(val) = value.extract::<f64>(py) {
return Ok(ScalarValue::Float64(Some(val)));
} else if let Ok(val) = value.extract::<bool>(py) {
return Ok(ScalarValue::Boolean(Some(val)));
} else if let Ok(val) = value.extract::<String>(py) {
return Ok(ScalarValue::Utf8(Some(val)));
} else if let Ok(dt) = py
.import("datetime")
.and_then(|m| m.getattr("datetime"))
.and_then(|dt| value.is_instance(dt))
{
if value.is_instance_of::<pyo3::types::PyDateTime>(py) {
let naive_dt = value.extract::<chrono::NaiveDateTime>(py)?;
return Ok(ScalarValue::TimestampNanosecond(
Some(naive_dt.timestamp_nanos()),
None,
));
} else {
return Err(PyDataFusionError::Common(
"Unsupported datetime type".to_string(),
));
}
}

// Try to convert to string as fallback
match value.str(py) {
Ok(py_str) => {
let s = py_str.to_string()?;
Ok(ScalarValue::Utf8(Some(s)))
}
Err(_) => Err(PyDataFusionError::Common(
"Unsupported Python type for fill_null".to_string(),
)),
}
}
Loading