Skip to content

Commit f028571

Browse files
perf: fix pandas PerformanceWarning caused due to frame.insert
1 parent 3a4b466 commit f028571

File tree

1 file changed

+30
-20
lines changed
  • packages/python/plotly/plotly/express

1 file changed

+30
-20
lines changed

packages/python/plotly/plotly/express/_core.py

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,6 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref):
321321
and args["y"]
322322
and len(trace_data[[args["x"], args["y"]]].dropna()) > 1
323323
):
324-
325324
# sorting is bad but trace_specs with "trendline" have no other attrs
326325
sorted_trace_data = trace_data.sort_values(by=args["x"])
327326
y = sorted_trace_data[args["y"]].values
@@ -562,7 +561,6 @@ def set_cartesian_axis_opts(args, axis, letter, orders):
562561

563562

564563
def configure_cartesian_marginal_axes(args, fig, orders):
565-
566564
if "histogram" in [args["marginal_x"], args["marginal_y"]]:
567565
fig.layout["barmode"] = "overlay"
568566

@@ -1064,14 +1062,14 @@ def _escape_col_name(df_input, col_name, extra):
10641062
return col_name
10651063

10661064

1067-
def to_unindexed_series(x):
1065+
def to_unindexed_series(x, name=None):
10681066
"""
10691067
assuming x is list-like or even an existing pd.Series, return a new pd.Series with
10701068
no index, without extracting the data from an existing Series via numpy, which
10711069
seems to mangle datetime columns. Stripping the index from existing pd.Series is
10721070
required to get things to match up right in the new DataFrame we're building
10731071
"""
1074-
return pd.Series(x).reset_index(drop=True)
1072+
return pd.Series(x, name=name).reset_index(drop=True)
10751073

10761074

10771075
def process_args_into_dataframe(args, wide_mode, var_name, value_name):
@@ -1086,9 +1084,12 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
10861084
df_input = args["data_frame"]
10871085
df_provided = df_input is not None
10881086

1089-
df_output = pd.DataFrame()
1090-
constants = dict()
1091-
ranges = list()
1087+
# we use a dict instead of a dataframe directly so that it doesn't cause
1088+
# a pandas PerformanceWarning from repeatedly setting the columns.
1089+
# a dict is used instead of a list as the columns need to be overwritten.
1090+
df_output = {}
1091+
constants = {}
1092+
ranges = []
10921093
wide_id_vars = set()
10931094
reserved_names = _get_reserved_col_names(args) if df_provided else set()
10941095

@@ -1099,7 +1100,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
10991100
"No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument."
11001101
)
11011102
else:
1102-
df_output[df_input.columns] = df_input[df_input.columns]
1103+
df_output = {col: series for col, series in df_input.items()}
11031104

11041105
# hover_data is a dict
11051106
hover_data_is_dict = (
@@ -1140,7 +1141,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
11401141
# argument_list and field_list ready, iterate over them
11411142
# Core of the loop starts here
11421143
for i, (argument, field) in enumerate(zip(argument_list, field_list)):
1143-
length = len(df_output)
1144+
length = len(df_output[next(iter(df_output))]) if len(df_output) else 0
11441145
if argument is None:
11451146
continue
11461147
col_name = None
@@ -1181,11 +1182,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
11811182
% (
11821183
argument,
11831184
len(real_argument),
1184-
str(list(df_output.columns)),
1185+
str(list(df_output.keys())),
11851186
length,
11861187
)
11871188
)
1188-
df_output[col_name] = to_unindexed_series(real_argument)
1189+
df_output[col_name] = to_unindexed_series(real_argument, col_name)
11891190
elif not df_provided:
11901191
raise ValueError(
11911192
"String or int arguments are only possible when a "
@@ -1214,13 +1215,15 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
12141215
% (
12151216
field,
12161217
len(df_input[argument]),
1217-
str(list(df_output.columns)),
1218+
str(list(df_output.keys())),
12181219
length,
12191220
)
12201221
)
12211222
else:
12221223
col_name = str(argument)
1223-
df_output[col_name] = to_unindexed_series(df_input[argument])
1224+
df_output[col_name] = to_unindexed_series(
1225+
df_input[argument], col_name
1226+
)
12241227
# ----------------- argument is likely a column / array / list.... -------
12251228
else:
12261229
if df_provided and hasattr(argument, "name"):
@@ -1247,9 +1250,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
12471250
"All arguments should have the same length. "
12481251
"The length of argument `%s` is %d, whereas the "
12491252
"length of previously-processed arguments %s is %d"
1250-
% (field, len(argument), str(list(df_output.columns)), length)
1253+
% (field, len(argument), str(list(df_output.keys())), length)
12511254
)
1252-
df_output[str(col_name)] = to_unindexed_series(argument)
1255+
df_output[str(col_name)] = to_unindexed_series(argument, str(col_name))
12531256

12541257
# Finally, update argument with column name now that column exists
12551258
assert col_name is not None, (
@@ -1267,12 +1270,19 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
12671270
if field_name != "wide_variable":
12681271
wide_id_vars.add(str(col_name))
12691272

1270-
for col_name in ranges:
1271-
df_output[col_name] = range(len(df_output))
1272-
1273-
for col_name in constants:
1274-
df_output[col_name] = constants[col_name]
1273+
length = len(df_output[next(iter(df_output))]) if len(df_output) else 0
1274+
df_output.update(
1275+
{col_name: pd.Series(range(length), name=col_name) for col_name in ranges}
1276+
)
1277+
df_output.update(
1278+
{
1279+
# each constant is a single value; repeat it `length` times to avoid creating NaN when the columns are combined
1280+
col_name: pd.Series([constants[col_name]] * length, name=col_name)
1281+
for col_name in constants
1282+
}
1283+
)
12751284

1285+
df_output = pd.DataFrame(df_output)
12761286
return df_output, wide_id_vars
12771287

12781288

0 commit comments

Comments
 (0)