apache · kosiew · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst
@@ -129,3 +129,39 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f
           .limit(20)
           .to_pandas()
     )
+
+
+Handling Missing Values
+=======================
+
+DataFusion provides methods to handle missing values in DataFrames:
+
+fill_null
+---------
+
+The ``fill_null()`` method replaces NULL values in specified columns with a provided value:
+
+.. code-block:: python
+
+    # Fill all NULL values with 0 where possible
+    df = df.fill_null(0)
+
+    # Fill NULL values only in specific string columns
+    df = df.fill_null("missing", subset=["name", "category"])
+
+The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged.
+
+fill_nan
+--------
+
+The ``fill_nan()`` method replaces NaN values in floating-point columns with a provided numeric value:
+
+.. code-block:: python
+
+    # Fill all NaN values with 0 in floating-point columns
+    df = df.fill_nan(0)
+
+    # Fill NaN values in specific floating-point columns
+    df = df.fill_nan(99.9, subset=["price", "score"])
+
+This only works on floating-point columns (float32, float64). The fill value must be numeric (int or float).
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -22,6 +22,7 @@
 from __future__ import annotations
 
 import warnings
+from enum import Enum
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -37,6 +38,9 @@
 except ImportError:
     from typing_extensions import deprecated  # Python 3.12
 
+from datafusion import functions as f
+from datafusion._internal import DataFrame as DataFrameInternal
+from datafusion.expr import Expr, SortExpr, sort_or_default
 from datafusion.plan import ExecutionPlan, LogicalPlan
 from datafusion.record_batch import RecordBatchStream
 
@@ -51,9 +55,9 @@
    from datafusion._internal import DataFrame as DataFrameInternal
    from datafusion._internal import expr as expr_internal

 from enum import Enum

 from datafusion.expr import Expr, SortExpr, sort_or_default


 # excerpt from deltalake
@@ -869,3 +873,26 @@
             DataFrame: After applying func to the original dataframe.
         """
         return func(self, *args)
+
+    def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
+        """Fill null values in specified columns with a value.
+
+        Args:
+            value: Value to replace nulls with. Will be cast to match column type.
+            subset: Optional list of column names to fill. If None, fills all columns.
+
+        Returns:
+            DataFrame with null values replaced where type casting is possible
+
+        Examples:
+            >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible
+            >>> # Fill nulls in specific string columns
+            >>> df = df.fill_null("missing", subset=["name", "category"])
+
+        Notes:
+            - Only fills nulls in columns where the value can be cast to the column type
+            - For columns where casting fails, the original column is kept unchanged
+            - For columns not in subset, the original column is kept unchanged
+        """
+
+        return DataFrame(self.df.fill_null(value, subset))
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
@@ -1231,3 +1231,57 @@
 
     actual = df.collect()[0].to_pydict()
     assert actual == expected
+
+
+def test_coalesce(df):
+    # Create a DataFrame with null values
+    ctx = SessionContext()
+    batch = pa.RecordBatch.from_arrays(
+        [
+            pa.array(["Hello", None, "!"]),  # string column with null
+            pa.array([4, None, 6]),  # integer column with null
+            pa.array(["hello ", None, " !"]),  # string column with null
+            pa.array(
+                [datetime(2022, 12, 31), None, datetime(2020, 7, 2)]
+            ),  # datetime with null
+            pa.array([False, None, True]),  # boolean column with null
+        ],
+        names=["a", "b", "c", "d", "e"],
+    )
+    df_with_nulls = ctx.create_dataframe([[batch]])
+
+    # Test coalesce with different data types
+    result_df = df_with_nulls.select(
+        f.coalesce(column("a"), literal("default")).alias("a_coalesced"),
+        f.coalesce(column("b"), literal(0)).alias("b_coalesced"),
+        f.coalesce(column("c"), literal("default")).alias("c_coalesced"),
+        f.coalesce(column("d"), literal(datetime(2000, 1, 1))).alias("d_coalesced"),
+        f.coalesce(column("e"), literal(False)).alias("e_coalesced"),
+    )
+
+    result = result_df.collect()[0]
+
+    # Verify results
+    assert result.column(0) == pa.array(
+        ["Hello", "default", "!"], type=pa.string_view()
+    )
+    assert result.column(1) == pa.array([4, 0, 6], type=pa.int64())
+    assert result.column(2) == pa.array(
+        ["hello ", "default", " !"], type=pa.string_view()
+    )
+    assert result.column(3) == pa.array(
+        [datetime(2022, 12, 31), datetime(2000, 1, 1), datetime(2020, 7, 2)],
+        type=pa.timestamp("us"),
+    )
+    assert result.column(4) == pa.array([False, False, True], type=pa.bool_())
+
+    # Test multiple arguments
+    result_df = df_with_nulls.select(
+        f.coalesce(column("a"), literal(None), literal("fallback")).alias(
+            "multi_coalesce"
+        )
+    )
+    result = result_df.collect()[0]
+    assert result.column(0) == pa.array(
+        ["Hello", "fallback", "!"], type=pa.string_view()
+    )
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -797,6 +797,25 @@ impl PyDataFrame {
     fn count(&self, py: Python) -> PyDataFusionResult<usize> {
         Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
     }
+
+    /// Fill nulls with `value` in the given `columns` (all columns when omitted).
+    #[pyo3(signature = (value, columns=None))]
+    fn fill_null(
+        &self,
+        value: PyObject,
+        columns: Option<Vec<PyBackedStr>>,
+        py: Python,
+    ) -> PyDataFusionResult<Self> {
+        let fill_value = python_value_to_scalar_value(&value, py)?;
+        // `None` becomes an empty list, which means "fill null for all columns".
+        let target_columns: Vec<String> = columns
+            .unwrap_or_default()
+            .iter()
+            .map(|name| name.to_string())
+            .collect();
+        let filled = self.df.as_ref().clone().fill_null(fill_value, target_columns)?;
+        Ok(Self::new(filled))
+    }
 }
 
 /// Print DataFrame
@@ -951,3 +970,47 @@ async fn collect_record_batches_to_display(
 
     Ok((record_batches, has_more))
 }
+
+/// Convert a Python fill value into a DataFusion `ScalarValue`.
+///
+/// Tried in order: `bool`, `int` (`Int64`), `float` (`Float64`), `str` (`Utf8`),
+/// `datetime.datetime` (`TimestampNanosecond`, no timezone); any other object
+/// is converted through Python's `str()`.
+///
+/// # Errors
+/// Fails for `None`, for datetimes that cannot be extracted or do not fit in
+/// nanoseconds, and for objects whose `str()` raises.
+fn python_value_to_scalar_value(value: &PyObject, py: Python) -> PyDataFusionResult<ScalarValue> {
+    let error = |msg: &str| PyDataFusionError::Common(msg.to_string());
+    let value = value.bind(py);
+    if value.is_none() {
+        return Err(error("Cannot use None as fill value"));
+    }
+    // `bool` subclasses `int` in Python, so it has to be matched before the
+    // integer extraction or `True`/`False` would be stored as `Int64`.
+    if value.is_instance_of::<pyo3::types::PyBool>() {
+        return Ok(ScalarValue::Boolean(Some(value.extract::<bool>()?)));
+    }
+    if let Ok(val) = value.extract::<i64>() {
+        return Ok(ScalarValue::Int64(Some(val)));
+    }
+    if let Ok(val) = value.extract::<f64>() {
+        return Ok(ScalarValue::Float64(Some(val)));
+    }
+    if let Ok(val) = value.extract::<String>() {
+        return Ok(ScalarValue::Utf8(Some(val)));
+    }
+    // Only a positive isinstance check selects the datetime path; a failed
+    // lookup or a `false` result falls through to the string fallback.
+    let datetime_type = py.import("datetime").and_then(|m| m.getattr("datetime"));
+    if datetime_type.and_then(|ty| value.is_instance(&ty)).unwrap_or(false) {
+        let naive_dt = value.extract::<chrono::NaiveDateTime>()?;
+        // `timestamp_nanos_opt` reports overflow instead of panicking.
+        let nanos = naive_dt.and_utc().timestamp_nanos_opt();
+        let nanos = nanos.ok_or_else(|| error("Datetime is out of range for nanoseconds"))?;
+        return Ok(ScalarValue::TimestampNanosecond(Some(nanos), None));
+    }
+    // Last resort: the object's Python string representation.
+    let text = value.str().map_err(|_| error("Unsupported Python type for fill_null"))?;
+    Ok(ScalarValue::Utf8(Some(text.to_string_lossy().into_owned())))
+}