-
Notifications
You must be signed in to change notification settings - Fork 67
feat: add df.bigquery.ai.forecast method to pandas dataframe accessor
#2518
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ | |
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from typing import cast | ||
| from typing import cast, Iterable, Optional | ||
|
|
||
| import pandas | ||
| import pandas.api.extensions | ||
|
|
@@ -21,6 +21,85 @@ | |
| import bigframes.pandas as bpd | ||
|
|
||
|
|
||
| class AIAccessor: | ||
| """ | ||
| Pandas DataFrame accessor for BigQuery AI functions. | ||
| """ | ||
|
|
||
| def __init__(self, pandas_obj: pandas.DataFrame): | ||
| self._obj = pandas_obj | ||
|
|
||
| def forecast( | ||
| self, | ||
| *, | ||
| data_col: str, | ||
| timestamp_col: str, | ||
| model: str = "TimesFM 2.0", | ||
| id_cols: Optional[Iterable[str]] = None, | ||
| horizon: int = 10, | ||
| confidence_level: float = 0.95, | ||
| context_window: Optional[int] = None, | ||
| output_historical_time_series: bool = False, | ||
| session=None, | ||
| ) -> pandas.DataFrame: | ||
| """ | ||
| Forecast time series at future horizon using BigQuery AI.FORECAST. | ||
|
|
||
| See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-forecast | ||
|
|
||
| Args: | ||
| data_col (str): | ||
| A str value that specifies the name of the data column. The data column contains the data to forecast. | ||
| The data column must use one of the following data types: INT64, NUMERIC and FLOAT64 | ||
| timestamp_col (str): | ||
| A str value that specifies the name of the time points column. | ||
| The time points column provides the time points used to generate the forecast. | ||
| The time points column must use one of the following data types: TIMESTAMP, DATE and DATETIME | ||
| model (str, default "TimesFM 2.0"): | ||
| A str value that specifies the name of the model. "TimesFM 2.0" and "TimesFM 2.5" are supported. | ||
| id_cols (Iterable[str], optional): | ||
| An iterable of str values that specify the names of one or more ID columns. Each ID identifies a unique time series to forecast. | ||
| Specify one or more values for this argument in order to forecast multiple time series using a single query. | ||
| The columns that you specify must use one of the following data types: STRING, INT64, ARRAY<STRING> and ARRAY<INT64> | ||
| horizon (int, default 10): | ||
| An int value that specifies the number of time points to forecast. The default value is 10. The valid input range is [1, 10,000]. | ||
| confidence_level (float, default 0.95): | ||
| A FLOAT64 value that specifies the percentage of the future values that fall in the prediction interval. | ||
| The default value is 0.95. The valid input range is [0, 1). | ||
| context_window (int, optional): | ||
| An int value that specifies the context window length used by BigQuery ML's built-in TimesFM model. | ||
| The context window length determines how many of the most recent data points from the input time series are used by the model. | ||
| If you don't specify a value, the AI.FORECAST function automatically chooses the smallest possible context window length to use | ||
| that is still large enough to cover the number of time series data points in your input data. | ||
| output_historical_time_series (bool, default False): | ||
| A boolean value that determines whether to include the input time series history in the forecast. | ||
| session (bigframes.session.Session, optional): | ||
| The BigFrames session to use. If not provided, the default global session is used. | ||
|
|
||
| Returns: | ||
| pandas.DataFrame: | ||
| The forecast DataFrame result. | ||
| """ | ||
| import bigframes.bigquery.ai | ||
|
|
||
| if session is None: | ||
| session = bf_session.get_global_session() | ||
|
|
||
| bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj)) | ||
| result = bigframes.bigquery.ai.forecast( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm curious whether this triggers our logger. My gut feeling says yes, because our logger only seems to track the call stack of annotated classes / functions. However, I'd still like to double-check with you on this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should trigger the logger, but I think we should probably add labels that are specific to the dataframe accessor. I'll follow up with that. |
||
| bf_df, | ||
| data_col=data_col, | ||
| timestamp_col=timestamp_col, | ||
| model=model, | ||
| id_cols=id_cols, | ||
| horizon=horizon, | ||
| confidence_level=confidence_level, | ||
| context_window=context_window, | ||
| output_historical_time_series=output_historical_time_series, | ||
| ) | ||
| return result.to_pandas(ordered=True) | ||
|
|
||
|
|
||
| @pandas.api.extensions.register_dataframe_accessor("bigquery") | ||
| class BigQueryDataFrameAccessor: | ||
| """ | ||
|
|
@@ -32,6 +111,13 @@ class BigQueryDataFrameAccessor: | |
| def __init__(self, pandas_obj: pandas.DataFrame): | ||
| self._obj = pandas_obj | ||
|
|
||
| @property | ||
| def ai(self) -> "AIAccessor": | ||
| """ | ||
| Accessor for BigQuery AI functions. | ||
| """ | ||
| return AIAccessor(self._obj) | ||
|
|
||
| def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None): | ||
| """ | ||
| Compute a new pandas Series by applying a SQL scalar function to the DataFrame. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jules Missing the `output_historical_time_series` boolean parameter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added the missing `output_historical_time_series` boolean parameter and documented it.