Skip to content

Commit de64391

Browse files
Merge branch 'main' into frame_1
2 parents abb8a5f + 9ee361b commit de64391

File tree

27 files changed

+253
-69
lines changed

27 files changed

+253
-69
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,7 @@ Other API changes
820820
:meth:`~DataFrame.ffill`, :meth:`~DataFrame.bfill`, :meth:`~DataFrame.interpolate`,
821821
:meth:`~DataFrame.where`, :meth:`~DataFrame.mask`, :meth:`~DataFrame.clip`) now return
822822
the modified DataFrame or Series (``self``) instead of ``None`` when ``inplace=True`` (:issue:`63207`)
823+
- All Index constructors now copy ``numpy.ndarray`` and ``ExtensionArray`` inputs by default when ``copy=None``, consistent with :class:`Series` behavior (:issue:`63388`)
823824

824825
.. ---------------------------------------------------------------------------
825826
.. _whatsnew_300.deprecations:

pandas/core/algorithms.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -898,18 +898,15 @@ def value_counts_internal(
898898
result = result.iloc[0:0]
899899

900900
# normalizing is by len of all (regardless of dropna)
901-
counts = np.array([len(ii)])
901+
normalize_denominator = len(ii)
902902

903903
else:
904+
normalize_denominator = None
904905
if is_extension_array_dtype(values):
905906
# handle Categorical and sparse,
906907
result = Series(values, copy=False)._values.value_counts(dropna=dropna)
907908
result.name = name
908909
result.index.name = index_name
909-
counts = result._values
910-
if not isinstance(counts, np.ndarray):
911-
# e.g. ArrowExtensionArray
912-
counts = np.asarray(counts)
913910

914911
elif isinstance(values, ABCMultiIndex):
915912
# GH49558
@@ -920,10 +917,6 @@ def value_counts_internal(
920917
.size()
921918
)
922919
result.index.names = values.names
923-
# error: Incompatible types in assignment (expression has type
924-
# "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
925-
# variable has type "ndarray[tuple[int, ...], dtype[Any]]")
926-
counts = result._values # type: ignore[assignment]
927920

928921
else:
929922
values = _ensure_arraylike(values, func_name="value_counts")
@@ -936,8 +929,7 @@ def value_counts_internal(
936929
idx = Index(keys, dtype=keys.dtype, name=index_name)
937930

938931
if (
939-
bins is None
940-
and not sort
932+
not sort
941933
and isinstance(values, (DatetimeIndex, TimedeltaIndex))
942934
and idx.equals(values)
943935
and values.inferred_freq is not None
@@ -951,7 +943,10 @@ def value_counts_internal(
951943
result = result.sort_values(ascending=ascending, kind="stable")
952944

953945
if normalize:
954-
result = result / counts.sum()
946+
if normalize_denominator is not None:
947+
result = result / normalize_denominator
948+
else:
949+
result = result / result.sum()
955950

956951
return result
957952

pandas/core/arrays/datetimes.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,16 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
226226
"""
227227

228228
_typ = "datetimearray"
229-
_internal_fill_value = np.datetime64("NaT", "ns")
230229
_recognized_scalars = (datetime, np.datetime64)
231230
_is_recognized_dtype: Callable[[DtypeObj], bool] = lambda x: lib.is_np_dtype(
232231
x, "M"
233232
) or isinstance(x, DatetimeTZDtype)
234233
_infer_matches = ("datetime", "datetime64", "date")
235234

235+
@property
236+
def _internal_fill_value(self) -> np.datetime64:
237+
return np.datetime64("NaT", self.unit)
238+
236239
@property
237240
def _scalar_type(self) -> type[Timestamp]:
238241
return Timestamp

pandas/core/arrays/timedeltas.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,14 @@ class TimedeltaArray(dtl.TimelikeOps):
154154
"""
155155

156156
_typ = "timedeltaarray"
157-
_internal_fill_value = np.timedelta64("NaT", "ns")
158157
_recognized_scalars = (timedelta, np.timedelta64, Tick)
159158
_is_recognized_dtype: Callable[[DtypeObj], bool] = lambda x: lib.is_np_dtype(x, "m")
160159
_infer_matches = ("timedelta", "timedelta64")
161160

161+
@property
162+
def _internal_fill_value(self) -> np.timedelta64:
163+
return np.timedelta64("NaT", self.unit)
164+
162165
@property
163166
def _scalar_type(self) -> type[Timedelta]:
164167
return Timedelta

pandas/core/indexes/base.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -505,12 +505,8 @@ def __new__(
505505
if not copy and isinstance(data, (ABCSeries, Index)):
506506
refs = data._references
507507

508-
if isinstance(data, (ExtensionArray, np.ndarray)):
509-
# GH 63306
510-
if copy is not False:
511-
if dtype is None or astype_is_view(data.dtype, dtype):
512-
data = data.copy()
513-
copy = False
508+
# GH 63306, GH 63388
509+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
514510

515511
# range
516512
if isinstance(data, (range, RangeIndex)):
@@ -5197,6 +5193,21 @@ def _raise_scalar_data_error(cls, data):
51975193
"was passed"
51985194
)
51995195

5196+
@classmethod
5197+
def _maybe_copy_array_input(
5198+
cls, data, copy: bool | None, dtype
5199+
) -> tuple[Any, bool]:
5200+
"""
5201+
Ensure that the input data is copied if necessary.
5202+
GH#63388
5203+
"""
5204+
if isinstance(data, (ExtensionArray, np.ndarray)):
5205+
if copy is not False:
5206+
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
5207+
data = data.copy()
5208+
copy = False
5209+
return data, bool(copy)
5210+
52005211
def _validate_fill_value(self, value):
52015212
"""
52025213
Check if the value can be inserted into our array without casting,
@@ -6865,12 +6876,15 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int:
68656876
# we need to look up the label
68666877
try:
68676878
slc = self.get_loc(label)
6868-
except KeyError as err:
6879+
except KeyError:
68696880
try:
68706881
return self._searchsorted_monotonic(label, side)
68716882
except ValueError:
6872-
# raise the original KeyError
6873-
raise err from None
6883+
raise KeyError(
6884+
f"Cannot get {side} slice bound for non-monotonic index "
6885+
f"with a missing label {original_label!r}. "
6886+
"Either sort the index or specify an existing label."
6887+
) from None
68746888

68756889
if isinstance(slc, np.ndarray):
68766890
# get_loc may return a boolean array, which

pandas/core/indexes/datetimes.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,13 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
181181
If True parse dates in `data` with the year first order.
182182
dtype : numpy.dtype or DatetimeTZDtype or str, default None
183183
Note that the only NumPy dtype allowed is `datetime64[ns]`.
184-
copy : bool, default False
185-
Make a copy of input ndarray.
184+
copy : bool, default None
185+
Whether to copy input data, only relevant for array, Series, and Index
186+
inputs (for other input, e.g. a list, a new array is created anyway).
187+
Defaults to True for array input and False for Index/Series.
188+
Set to False to avoid copying array input at your own risk (if you
189+
know the input data won't be modified elsewhere).
190+
Set to True to force copying Series/Index input up front.
186191
name : label, default None
187192
Name to be stored in the index.
188193
@@ -669,7 +674,7 @@ def __new__(
669674
dayfirst: bool = False,
670675
yearfirst: bool = False,
671676
dtype: Dtype | None = None,
672-
copy: bool = False,
677+
copy: bool | None = None,
673678
name: Hashable | None = None,
674679
) -> Self:
675680
if is_scalar(data):
@@ -679,6 +684,9 @@ def __new__(
679684

680685
name = maybe_extract_name(name, data, cls)
681686

687+
# GH#63388
688+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
689+
682690
if (
683691
isinstance(data, DatetimeArray)
684692
and freq is lib.no_default

pandas/core/indexes/interval.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,13 @@ class IntervalIndex(ExtensionIndex):
169169
neither.
170170
dtype : dtype or None, default None
171171
If None, dtype will be inferred.
172-
copy : bool, default False
173-
Copy the input data.
172+
copy : bool, default None
173+
Whether to copy input data, only relevant for array, Series, and Index
174+
inputs (for other input, e.g. a list, a new array is created anyway).
175+
Defaults to True for array input and False for Index/Series.
176+
Set to False to avoid copying array input at your own risk (if you
177+
know the input data won't be modified elsewhere).
178+
Set to True to force copying Series/Index input up front.
174179
name : object, optional
175180
Name to be stored in the index.
176181
verify_integrity : bool, default True
@@ -252,12 +257,15 @@ def __new__(
252257
data,
253258
closed: IntervalClosedType | None = None,
254259
dtype: Dtype | None = None,
255-
copy: bool = False,
260+
copy: bool | None = None,
256261
name: Hashable | None = None,
257262
verify_integrity: bool = True,
258263
) -> Self:
259264
name = maybe_extract_name(name, data, cls)
260265

266+
# GH#63388
267+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
268+
261269
with rewrite_exception("IntervalArray", cls.__name__):
262270
array = IntervalArray(
263271
data,

pandas/core/indexes/period.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,13 @@ class PeriodIndex(DatetimeIndexOpsMixin):
101101
One of pandas period strings or corresponding objects.
102102
dtype : str or PeriodDtype, default None
103103
A dtype from which to extract a freq.
104-
copy : bool
105-
Make a copy of input ndarray.
104+
copy : bool, default None
105+
Whether to copy input data, only relevant for array, Series, and Index
106+
inputs (for other input, e.g. a list, a new array is created anyway).
107+
Defaults to True for array input and False for Index/Series.
108+
Set to False to avoid copying array input at your own risk (if you
109+
know the input data won't be modified elsewhere).
110+
Set to True to force copying Series/Index input up front.
106111
name : str, default None
107112
Name of the resulting PeriodIndex.
108113
@@ -220,7 +225,7 @@ def __new__(
220225
data=None,
221226
freq=None,
222227
dtype: Dtype | None = None,
223-
copy: bool = False,
228+
copy: bool | None = None,
224229
name: Hashable | None = None,
225230
) -> Self:
226231
refs = None
@@ -231,6 +236,9 @@ def __new__(
231236

232237
freq = validate_dtype_freq(dtype, freq)
233238

239+
# GH#63388
240+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
241+
234242
# PeriodIndex allow PeriodIndex(period_index, freq=different)
235243
# Let's not encourage that kind of behavior in PeriodArray.
236244

pandas/core/indexes/timedeltas.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,13 @@ class TimedeltaIndex(DatetimeTimedeltaMixin):
8181
dtype : numpy.dtype or str, default None
8282
Valid ``numpy`` dtypes are ``timedelta64[ns]``, ``timedelta64[us]``,
8383
``timedelta64[ms]``, and ``timedelta64[s]``.
84-
copy : bool
85-
Make a copy of input array.
84+
copy : bool, default None
85+
Whether to copy input data, only relevant for array, Series, and Index
86+
inputs (for other input, e.g. a list, a new array is created anyway).
87+
Defaults to True for array input and False for Index/Series.
88+
Set to False to avoid copying array input at your own risk (if you
89+
know the input data won't be modified elsewhere).
90+
Set to True to force copying Series/Index input up front.
8691
name : object
8792
Name to be stored in the index.
8893
@@ -158,11 +163,14 @@ def __new__(
158163
data=None,
159164
freq=lib.no_default,
160165
dtype=None,
161-
copy: bool = False,
166+
copy: bool | None = None,
162167
name=None,
163168
):
164169
name = maybe_extract_name(name, data, cls)
165170

171+
# GH#63388
172+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
173+
166174
if is_scalar(data):
167175
cls._raise_scalar_data_error(data)
168176

pandas/io/html.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@
2424
AbstractMethodError,
2525
EmptyDataError,
2626
)
27-
from pandas.util._decorators import (
28-
doc,
29-
set_module,
30-
)
27+
from pandas.util._decorators import set_module
3128
from pandas.util._validators import check_dtype_backend
3229

3330
from pandas.core.dtypes.common import is_list_like
@@ -36,7 +33,6 @@
3633
from pandas.core.indexes.base import Index
3734
from pandas.core.indexes.multi import MultiIndex
3835
from pandas.core.series import Series
39-
from pandas.core.shared_docs import _shared_docs
4036

4137
from pandas.io.common import (
4238
get_handle,
@@ -1024,7 +1020,6 @@ def _parse(
10241020

10251021

10261022
@set_module("pandas")
1027-
@doc(storage_options=_shared_docs["storage_options"])
10281023
def read_html(
10291024
io: FilePath | ReadBuffer[str],
10301025
*,
@@ -1155,7 +1150,15 @@ def read_html(
11551150
11561151
.. versionadded:: 2.0
11571152
1158-
{storage_options}
1153+
storage_options : dict, optional
1154+
Extra options that make sense for a particular storage connection, e.g.
1155+
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
1156+
are forwarded to ``urllib.request.Request`` as header options. For other
1157+
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
1158+
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
1159+
details, and for more examples on storage options refer `here
1160+
<https://pandas.pydata.org/docs/user_guide/io.html?
1161+
highlight=storage_options#reading-writing-remote-files>`_.
11591162
11601163
.. versionadded:: 2.1.0
11611164

0 commit comments

Comments
 (0)