Merge branch 'main' into shuowei-feat-persist-obj-ref

shuoweil · web-flow · commit 91ce83976d55 · 2026-03-18T12:38:48.000-07:00
diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py
@@ -880,6 +880,7 @@ def forecast(
     id_cols: Iterable[str] | None = None,
     horizon: int = 10,
     confidence_level: float = 0.95,
+    output_historical_time_series: bool = False,
     context_window: int | None = None,
 ) -> dataframe.DataFrame:
     """
@@ -914,6 +915,15 @@ def forecast(
         confidence_level (float, default 0.95):
             A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval.
             The default value is 0.95. The valid input range is [0, 1).
+        output_historical_time_series (bool, default False):
+            A BOOL value that determines whether the input data is returned
+            along with the forecasted data. Set this argument to TRUE to return
+            input data. The default value is FALSE.
+
+            Returning the input data along with the forecasted data lets you
+            compare the historical value of the data column with the forecasted
+            value of the data column, or chart the change in the data column
+            values over time.
         context_window (int, optional):
             An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model.
             The context window length determines how many of the most recent data points from the input time series are use by the model.
@@ -945,6 +955,7 @@ def forecast(
         "timestamp_col": timestamp_col,
         "model": model,
         "horizon": horizon,
+        "output_historical_time_series": output_historical_time_series,
         "confidence_level": confidence_level,
     }
     if id_cols:
diff --git a/bigframes/extensions/pandas/dataframe_accessor.py b/bigframes/extensions/pandas/dataframe_accessor.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import cast
+from typing import cast, Iterable, Optional
 
 import pandas
 import pandas.api.extensions
@@ -21,6 +21,85 @@
 import bigframes.pandas as bpd
 
 
+class AIAccessor:
+    """
+    Pandas DataFrame accessor for BigQuery AI functions.
+    """
+
+    def __init__(self, pandas_obj: pandas.DataFrame):
+        self._obj = pandas_obj
+
+    def forecast(
+        self,
+        *,
+        data_col: str,
+        timestamp_col: str,
+        model: str = "TimesFM 2.0",
+        id_cols: Optional[Iterable[str]] = None,
+        horizon: int = 10,
+        confidence_level: float = 0.95,
+        context_window: Optional[int] = None,
+        output_historical_time_series: bool = False,
+        session=None,
+    ) -> pandas.DataFrame:
+        """
+        Forecast time series at future horizon using BigQuery AI.FORECAST.
+
+        See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast
+
+        Args:
+            data_col (str):
+                A str value that specifies the name of the data column. The data column contains the data to forecast.
+                The data column must use one of the following data types: INT64, NUMERIC and FLOAT64
+            timestamp_col (str):
+                A str value that specifies the name of the time points column.
+                The time points column provides the time points used to generate the forecast.
+                The time points column must use one of the following data types: TIMESTAMP, DATE and DATETIME
+            model (str, default "TimesFM 2.0"):
+                A str value that specifies the name of the model. "TimesFM 2.0" and "TimesFM 2.5" are supported.
+            id_cols (Iterable[str], optional):
+                An iterable of str values that specifies the names of one or more ID columns. Each ID identifies a unique time series to forecast.
+                Specify one or more values for this argument in order to forecast multiple time series using a single query.
+                The columns that you specify must use one of the following data types: STRING, INT64, ARRAY<STRING> and ARRAY<INT64>
+            horizon (int, default 10):
+                An int value that specifies the number of time points to forecast. The default value is 10. The valid input range is [1, 10,000].
+            confidence_level (float, default 0.95):
+                A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval.
+                The default value is 0.95. The valid input range is [0, 1).
+            context_window (int, optional):
+                An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model.
+                The context window length determines how many of the most recent data points from the input time series are used by the model.
+                If you don't specify a value, the AI.FORECAST function automatically chooses the smallest possible context window length to use
+                that is still large enough to cover the number of time series data points in your input data.
+            output_historical_time_series (bool, default False):
+                A boolean value that determines whether to include the input time series history in the forecast.
+            session (bigframes.session.Session, optional):
+                The BigFrames session to use. If not provided, the default global session is used.
+
+        Returns:
+            pandas.DataFrame:
+                The forecast result, converted back to pandas via ``to_pandas(ordered=True)``.
+        """
+        import bigframes.bigquery.ai
+
+        if session is None:
+            session = bf_session.get_global_session()
+
+        bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj))
+        result = bigframes.bigquery.ai.forecast(
+            bf_df,
+            data_col=data_col,
+            timestamp_col=timestamp_col,
+            model=model,
+            id_cols=id_cols,
+            horizon=horizon,
+            confidence_level=confidence_level,
+            context_window=context_window,
+            output_historical_time_series=output_historical_time_series,
+        )
+        return result.to_pandas(ordered=True)
+
+
 @pandas.api.extensions.register_dataframe_accessor("bigquery")
 class BigQueryDataFrameAccessor:
     """
@@ -32,6 +111,13 @@ class BigQueryDataFrameAccessor:
     def __init__(self, pandas_obj: pandas.DataFrame):
         self._obj = pandas_obj
 
+    @property
+    def ai(self) -> "AIAccessor":
+        """
+        Return a new ``AIAccessor`` wrapping this DataFrame for BigQuery AI functions.
+        """
+        return AIAccessor(self._obj)
+
     def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None):
         """
         Compute a new pandas Series by applying a SQL scalar function to the DataFrame.
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
@@ -88,6 +88,7 @@ def test_arima_plus_model_fit_score(
         result,
         columns=expected_columns,
         index=2 if id_col_name else 1,
+        col_exact=False,
     )
 
     # save, load to ensure configuration was kept
diff --git a/tests/system/large/ml/test_llm.py b/tests/system/large/ml/test_llm.py
@@ -198,6 +198,7 @@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name):
             "evaluation_status",
         ],
         index=1,
+        col_exact=False,
     )
 
 
@@ -226,6 +227,7 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name)
             "label",
             "evaluation_status",
         ],
+        col_exact=False,
     )
 
 
diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py
@@ -493,7 +493,7 @@ def test_arima_plus_score(
             dtype="Float64",
         )
     pd.testing.assert_frame_equal(
-        result,
+        result[expected.columns],
         expected,
         rtol=0.1,
         check_index_type=False,
@@ -594,7 +594,7 @@ def test_arima_plus_score_series(
             dtype="Float64",
         )
     pd.testing.assert_frame_equal(
-        result,
+        result[expected.columns],
         expected,
         rtol=0.1,
         check_index_type=False,
diff --git a/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py b/tests/unit/core/compile/sqlglot/test_dataframe_accessor.py
@@ -43,3 +43,44 @@ def to_pandas(series, *, ordered):
 
     session.read_pandas.assert_called_once()
     snapshot.assert_match(result, "out.sql")
+
+
+def test_ai_forecast(snapshot, monkeypatch):
+    """The pandas ``ai.forecast`` accessor forwards its arguments and defaults."""
+    import bigframes.bigquery.ai
+    import bigframes.session
+
+    session = mock.create_autospec(bigframes.session.Session)
+    bf_df = mock.create_autospec(bpd.DataFrame)
+    session.read_pandas.return_value = bf_df
+
+    def mock_ai_forecast(df, **kwargs):
+        # The accessor must pass along the frame produced by session.read_pandas.
+        assert df is bf_df
+        result_df = mock.create_autospec(bpd.DataFrame)
+        # Echo the received keyword arguments back through to_pandas() so the
+        # test can inspect exactly what the accessor forwarded.
+        result_df.to_pandas.return_value = kwargs
+        return result_df
+
+    monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast)
+
+    df = pd.DataFrame({"date": ["2020-01-01"], "value": [1.0]})
+    result = df.bigquery.ai.forecast(
+        timestamp_col="date",
+        data_col="value",
+        horizon=5,
+        session=session,
+    )
+
+    session.read_pandas.assert_called_once()
+    assert result == {
+        "timestamp_col": "date",
+        "data_col": "value",
+        "model": "TimesFM 2.0",
+        "id_cols": None,
+        "horizon": 5,
+        "confidence_level": 0.95,
+        "context_window": None,
+        "output_historical_time_series": False,
+    }

Original file line number	Diff line number	Diff line change
`@@ -88,6 +88,7 @@ def test_arima_plus_model_fit_score(`
`88`	`88`	`result,`
`89`	`89`	`columns=expected_columns,`
`90`	`90`	`index=2 if id_col_name else 1,`
	`91`	`+ col_exact=False,`
`91`	`92`	`)`
`92`	`93`
`93`	`94`	`# save, load to ensure configuration was kept`
Original file line number	Diff line number	Diff line change
`@@ -198,6 +198,7 @@ def test_llm_gemini_score(llm_fine_tune_df_default_index, model_name):`
`198`	`198`	`"evaluation_status",`
`199`	`199`	`],`
`200`	`200`	`index=1,`
	`201`	`+ col_exact=False,`
`201`	`202`	`)`
`202`	`203`
`203`	`204`
`@@ -226,6 +227,7 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name)`
`226`	`227`	`"label",`
`227`	`228`	`"evaluation_status",`
`228`	`229`	`],`
	`230`	`+ col_exact=False,`
`229`	`231`	`)`
`230`	`232`
`231`	`233`