feat: add fill_nan method to DataFrame for handling NaN values

kosiew · kosiew · commit df6208e1718c · 2025-02-12T16:06:15.000+08:00
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -45,6 +45,7 @@
 
     import pandas as pd
     import polars as pl
+    import pyarrow as pa
 
 from enum import Enum
 
@@ -909,3 +910,69 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
                 exprs.append(f.col(col_name))
 
         return self.select(*exprs)
+
+    def fill_nan(
+        self, value: float | int, subset: list[str] | None = None
+    ) -> "DataFrame":
+        """Fill NaN values in floating-point columns with a value.
+
+        Columns that are not selected are passed through unchanged and the
+        column order of the DataFrame is preserved.
+
+        Args:
+            value: Numeric value (int or float, but not bool) that replaces NaN.
+            subset: Optional list of column names to fill. If None, every
+                floating-point column is filled.
+
+        Returns:
+            DataFrame with NaN values replaced in the selected columns.
+
+        Raises:
+            ValueError: If ``value`` is not an int or float, or if a column in
+                ``subset`` is missing from the DataFrame or is not a
+                floating-point column.
+
+        Examples:
+            >>> df = df.fill_nan(0)  # Fill NaNs in all floating-point columns
+            >>> df = df.fill_nan(99.9, subset=["price", "score"])
+
+        Notes:
+            - Only floating-point columns are eligible; with ``subset=None``,
+              all other columns are kept unchanged.
+            - Naming a non-floating-point column in ``subset`` is an error.
+        """
+        # bool is a subclass of int, so it has to be excluded explicitly.
+        if isinstance(value, bool) or not isinstance(value, (int, float)):
+            raise ValueError("Value must be numeric (int or float)")
+
+        import pyarrow as pa
+
+        from datafusion import functions as f
+
+        # Resolve the schema once rather than once per lookup.
+        schema = self.schema()
+        float_columns = {
+            field.name for field in schema if pa.types.is_floating(field.type)
+        }
+
+        if subset is None:
+            targets = float_columns
+        else:
+            known_columns = set(schema.names)
+            for col in subset:
+                if col not in known_columns:
+                    raise ValueError(f"Column '{col}' not found in DataFrame")
+                if col not in float_columns:
+                    raise ValueError(
+                        f"Column '{col}' is not a floating-point column"
+                    )
+            targets = set(subset)
+
+        # Wrap targeted columns in nanvl; pass every other column through as-is.
+        exprs = [
+            f.nanvl(f.col(name), f.lit(value)).alias(name)
+            if name in targets
+            else f.col(name)
+            for name in schema.names
+        ]
+        return self.select(*exprs)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1264,3 +1264,42 @@ def test_fill_null(df):
     )
     with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
         df_with_nulls.fill_null("missing", subset=["e", "f"])
+
+
+def test_fill_nan(df):
+    """Exercise DataFrame.fill_nan on all-NaN float64 columns."""
+    nan_column = literal(float("nan")).cast(pa.float64())
+    # NaN != NaN, so a plain list comparison can never match NaN results.
+    all_nan = pytest.approx([float("nan")] * 3, nan_ok=True)
+
+    # Integer fill value.
+    df_with_nans = df.with_column("d", nan_column)
+    result = df_with_nans.fill_nan(0).to_pydict()
+    assert result["d"] == [0, 0, 0]
+
+    # Float fill value.
+    result = df_with_nans.fill_nan(99.9).to_pydict()
+    assert result["d"] == [99.9, 99.9, 99.9]
+
+    # Non-numeric fill values are rejected.
+    with pytest.raises(ValueError, match="Value must be numeric"):
+        df_with_nans.fill_nan("invalid")
+
+    df_with_nans = df.with_columns(
+        nan_column.alias("d"),
+        nan_column.alias("e"),
+    )
+
+    # Only the columns named in subset are filled; the others keep their NaNs.
+    result = df_with_nans.fill_nan(99.9, subset=["e"]).to_pydict()
+    assert result["d"] == all_nan
+    assert result["e"] == [99.9, 99.9, 99.9]
+
+    # Several columns can be named in subset.
+    result = df_with_nans.fill_nan(0, subset=["d", "e"]).to_pydict()
+    assert result["d"] == [0, 0, 0]
+    assert result["e"] == [0, 0, 0]
+
+    # Unknown column names are rejected.
+    with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
+        df_with_nans.fill_nan(0, subset=["e", "f"])