HelpersTask727_Add_support_for_openrouter #728

Draft · wants to merge 1 commit into master
8 changes: 4 additions & 4 deletions dev_scripts_helpers/llms/llm_transform.py
@@ -2,11 +2,11 @@

"""
Read input from either stdin or a file, apply a specified transformation using
an LLM, and then write the output to either stdout or a file. It is particularly
useful for integrating with editors like Vim.

The script `dockerized_llm_transform.py` is executed within a Docker container
to ensure all dependencies are met. The Docker container is built dynamically if
necessary. The script requires an OpenAI API key to be set in the environment.

Examples
113 changes: 113 additions & 0 deletions helpers/hpandas.py
@@ -2527,3 +2527,116 @@ def to_perc(vals: Union[List, pd.Series], **perc_kwargs: Dict[str, Any]) -> str:
    vals = pd.Series(vals)
    ret = hprint.perc(vals.sum(), len(vals), **perc_kwargs)
    return ret


# #############################################################################


def convert_to_type(srs: pd.Series, type_: str) -> pd.Series:
    """
    Convert the elements of a series to a specific type when possible, and
    to NaN otherwise.

    :param srs: column to convert
    :param type_: type to convert to
    :return: series with converted values or NaN
    """
    hdbg.dassert_isinstance(srs, pd.Series)
    if type_ == "is_bool":

        def to_bool(x: Any) -> Any:
            # Map a bool, a bool-like string, or an int / float equal to 0 or
            # 1 to a bool; anything else becomes NaN.
            true_values = ("True", "true", "1", 1, 1.0)
            false_values = ("False", "false", "0", 0, 0.0)
            if isinstance(x, bool):
                ret = x
            elif x in true_values:
                ret = True
            elif x in false_values:
                ret = False
            else:
                ret = np.nan
            return ret

        ret = srs.map(to_bool)
    elif type_ == "is_int":
        ret = pd.to_numeric(srs, errors="coerce")
        ret = ret.apply(
            lambda x: x if pd.notna(x) and float(x).is_integer() else np.nan
        )
    elif type_ == "is_float":
        ret = pd.to_numeric(srs, errors="coerce")
    elif type_ == "is_string":
        ret = srs.astype(str)
    else:
        # TODO(gp): Add timestamp.
        raise ValueError(f"Unknown column type: {type_}")
    return ret
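
A minimal usage sketch for `convert_to_type` (the sample data and expected output are illustrative, not from the PR; it assumes the function is available, e.g., imported from `helpers.hpandas`):

```
import pandas as pd

# A column of mixed bool-like values.
srs = pd.Series(["True", "false", 1, 0.0, "maybe"])
# "maybe" cannot be interpreted as a bool, so it becomes NaN.
convert_to_type(srs, "is_bool")
# -> True, False, True, False, NaN
```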


def infer_column_types(srs: pd.Series) -> Tuple[Dict[str, float], pd.Series]:
    """
    Infer the type of a column by trying to convert it to each type and
    picking the one with the highest count, giving priority to stricter
    types.

    :param srs: column to infer the type of
    :return: fraction of convertible values for each type, and the series
        converted to the inferred type, e.g.,
        ```
        counts = {
            "is_bool": 0.2,
            "is_int": 0.3,
            "is_float": 0.5,
            "is_string": 1.0
        }
        ```
    """
    hdbg.dassert_isinstance(srs, pd.Series)
    # Convert the column to each type.
    vals = {}
    types = ["is_bool", "is_int", "is_float", "is_string"]
    for type_ in types:
        vals[type_] = convert_to_type(srs, type_)
    # Count the fraction of convertible values for each type.
    counts = {}
    for type_ in types:
        counts[type_] = float(vals[type_].notna().mean())
    # Choose the type with the highest count, using an order of preference
    # from strictest to loosest.
    if counts["is_bool"] > 0.0 and counts["is_bool"] >= max(
        counts["is_int"], counts["is_float"], counts["is_string"]
    ):
        type_ = "is_bool"
    elif counts["is_int"] > 0.0 and counts["is_int"] >= max(
        counts["is_float"], counts["is_string"]
    ):
        type_ = "is_int"
    elif counts["is_float"] > 0.0:
        type_ = "is_float"
    else:
        type_ = "is_string"
    return counts, vals[type_]
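
To illustrate the order of preference, a short sketch (hypothetical data, again assuming the function is importable): since `astype(str)` essentially never fails, `counts["is_string"]` is always 1.0, so a stricter type is chosen only when it converts just as many values.

```
import pandas as pd

# Every value is 0/1-like, so all four counts are 1.0 and is_bool wins
# the tie as the strictest type.
counts, vals = infer_column_types(pd.Series(["1", "0", "1"]))

# "7" is not bool-like, so is_bool drops to ~0.67 while is_int stays at
# 1.0 and wins.
counts, vals = infer_column_types(pd.Series(["1", "0", "7"]))
```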


def infer_types_df(
    df: pd.DataFrame,
    *,
    report_invalid_values: bool = False,
    log_level: int = logging.INFO,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Infer the type of each column in the DataFrame and return a new DataFrame
    with the inferred types.

    :param df: DataFrame to infer the types of
    :param report_invalid_values: whether to report values that failed
        conversion
    :param log_level: level at which to log the invalid values
    :return: tuple of (DataFrame with the inferred types, DataFrame with the
        counts)
    """
    # Accumulate the per-column counts and build the output DataFrame.
    types_df = []
    df_out = df.copy()
    # Process each column in the DataFrame.
    for col_name in df.columns:
        _LOG.info("Processing col_name='%s'", col_name)
        counts, vals = infer_column_types(df[col_name])
        # If requested, print any values that failed conversion.
        if report_invalid_values:
            mask = vals.isna()
            if mask.any():
                _LOG.log(
                    log_level, "column=%s\n%s", col_name, df[col_name][mask]
                )
        # Store results.
        df_out[col_name] = vals
        types_df.append(counts)
    # Package the counts into a DataFrame, one row per column.
    types_df = pd.DataFrame(types_df)
    types_df.index = df.columns
    return df_out, types_df
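
End to end, a toy example of the two return values (hypothetical data and column names; assuming the functions are imported from `helpers.hpandas`):

```
import pandas as pd

df = pd.DataFrame(
    {
        "flag": ["true", "False", "1"],
        "qty": ["10", "20", "30"],
        "price": ["1.5", "2.0", "oops"],
    }
)
df_out, types_df = infer_types_df(df, report_invalid_values=True)
# "flag" is inferred as bool, "qty" as int, and "price" as float, with
# "oops" mapped to NaN and logged since report_invalid_values=True.
# `types_df` has one row per column and one column per candidate type,
# holding the fraction of convertible values.
```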