diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 9c986e5b1b054..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,156 +0,0 @@ -version: 2.1 - -jobs: - test-linux-arm: - machine: - image: default - resource_class: arm.large - environment: - ENV_FILE: ci/deps/circle-311-arm64.yaml - PYTEST_WORKERS: auto - PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" - PYTEST_TARGET: "pandas" - PANDAS_CI: "1" - steps: - - checkout - - run: - name: Install Environment and Run Tests - shell: /bin/bash -exo pipefail - # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd - command: | - MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" - wget -q $MINI_URL -O Miniforge3.sh - chmod +x Miniforge3.sh - MINI_DIR="$HOME/miniconda3" - rm -rf $MINI_DIR - ./Miniforge3.sh -b -p $MINI_DIR - export PATH=$MINI_DIR/bin:$PATH - conda info -a - conda env create -q -n pandas-dev -f $ENV_FILE - conda list -n pandas-dev - source activate pandas-dev - if pip show pandas 1>/dev/null; then - pip uninstall -y pandas - fi - python -m pip install --no-build-isolation -ve . -Csetup-args="--werror" - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 - ci/run_tests.sh - test-linux-musl: - docker: - - image: quay.io/pypa/musllinux_1_1_aarch64 - resource_class: arm.large - steps: - # Install pkgs first to have git in the image - # (needed for checkout) - - run: - name: Install System Packages - command: | - apk update - apk add git - apk add musl-locales - - checkout - - run: - name: Install Environment and Run Tests - command: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 - python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" - python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml - build-aarch64: - parameters: - cibw-build: - type: string - machine: - image: default - resource_class: arm.large - environment: - TRIGGER_SOURCE: << pipeline.trigger_source >> - steps: - - checkout - - run: - name: Check if build is necessary - command: | - # Check if tag is defined or TRIGGER_SOURCE is scheduled - if [[ -n "$CIRCLE_TAG" ]]; then - echo 'export IS_PUSH="true"' >> "$BASH_ENV" - elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then - echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV" - # Look for the build label/[wheel build] in commit - # grep takes a regex, so need to escape brackets - elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then - : # Do nothing - elif ! 
(curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then - circleci-agent step halt - fi - - run: - name: Build aarch64 wheels - no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that - command: | - pip3 install cibuildwheel==2.20.0 - if [[ $CIBW_BUILD == cp313t* ]]; then - # TODO: temporarily run 3.13 free threaded builds without build isolation - # since we need pre-release cython - CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --output-dir wheelhouse - else - cibuildwheel --output-dir wheelhouse - fi - - environment: - CIBW_BUILD: << parameters.cibw-build >> - - - run: - name: Install Anaconda Client & Upload Wheels - shell: /bin/bash -exo pipefail - command: | - MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" - wget -q $MINI_URL -O Miniforge3.sh - chmod +x Miniforge3.sh - MINI_DIR="$HOME/miniconda3" - rm -rf $MINI_DIR - ./Miniforge3.sh -b -p $MINI_DIR - export PATH=$MINI_DIR/bin:$PATH - conda install -y -c conda-forge anaconda-client - source ci/upload_wheels.sh - set_upload_vars - upload_wheels - - store_artifacts: - path: wheelhouse/ - -workflows: - test: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-linux-arm - test-musl: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-linux-musl - build-wheels: - jobs: - - build-aarch64: - filters: - tags: - only: /^v.*/ - matrix: - parameters: - cibw-build: ["cp310-manylinux_aarch64", - "cp311-manylinux_aarch64", - "cp312-manylinux_aarch64", - "cp313-manylinux_aarch64", - "cp313t-manylinux_aarch64", - "cp310-musllinux_aarch64", - "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64", - "cp313-musllinux_aarch64", - "cp313t-musllinux_aarch64"] diff --git a/.gitattributes b/.gitattributes index b3d70ca8b24fb..d94c19e7edb1f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -61,7 +61,6 @@ pandas/_version.py export-subst *.pxi export-ignore # Ignoring stuff from the top level -.circleci export-ignore .github export-ignore asv_bench export-ignore ci export-ignore @@ -85,4 +84,5 @@ pandas/tests/io/parser/data export-ignore # Include cibw script in sdist since it's needed for building wheels scripts/cibw_before_build.sh -export-ignore -scripts/cibw_before_test.sh -export-ignore +scripts/cibw_before_build_windows.sh -export-ignore +scripts/cibw_before_test_windows.sh -export-ignore diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 212ce7441dfab..08c41a1eeb21f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -22,10 +22,11 @@ defaults: jobs: ubuntu: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.platform }} timeout-minutes: 90 strategy: matrix: + platform: [ubuntu-22.04, ubuntu-24.04-arm] env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] @@ -35,9 +36,11 @@ jobs: env_file: actions-311-downstream_compat.yaml pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" + platform: ubuntu-22.04 - name: "Minimum Versions" env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 
- name: "Locale: it_IT" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -48,6 +51,7 @@ jobs: # Also install it_IT (its encoding is ISO8859-1) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "it_IT" + platform: ubuntu-22.04 - name: "Locale: zh_CN" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -58,25 +62,32 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" + platform: ubuntu-22.04 - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" + platform: ubuntu-22.04 - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" + platform: ubuntu-22.04 - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" + platform: ubuntu-22.04 - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" test_args: "-W error::DeprecationWarning -W error::FutureWarning" + platform: ubuntu-22.04 - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" + platform: ubuntu-22.04 fail-fast: false - name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }} env: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} @@ -91,12 +102,12 @@ jobs: REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} cancel-in-progress: true services: mysql: - image: mysql:8 + image: mysql:9 env: MYSQL_ALLOW_EMPTY_PASSWORD: yes MYSQL_DATABASE: pandas @@ -109,7 +120,7 @@ jobs: - 3306:3306 postgres: - image: postgres:16 + image: postgres:17 env: PGUSER: postgres POSTGRES_USER: postgres @@ -124,7 +135,7 @@ jobs: - 5432:5432 moto: - image: motoserver/moto:5.0.0 + image: motoserver/moto:5.0.27 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret @@ -231,15 +242,14 @@ jobs: - name: Build environment and Run Tests # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . 
-Csetup-args="--werror" python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit @@ -248,7 +258,7 @@ jobs: Linux-Musl: runs-on: ubuntu-22.04 container: - image: quay.io/pypa/musllinux_1_1_x86_64 + image: quay.io/pypa/musllinux_1_2_x86_64 steps: - name: Checkout pandas Repo # actions/checkout does not work since it requires node @@ -270,7 +280,7 @@ jobs: apk add musl-locales - name: Build environment run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 @@ -280,8 +290,7 @@ jobs: - name: Run Tests run: | . ~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl @@ -346,8 +355,7 @@ jobs: python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install versioneer[toml] python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . 
--no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list @@ -364,7 +372,7 @@ jobs: concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev cancel-in-progress: true env: @@ -387,10 +395,9 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list @@ -417,20 +424,20 @@ jobs: with: fetch-depth: 0 - - name: Set up Python for Pyodide + - name: Set up Python for pyodide-build id: setup-python uses: actions/setup-python@v5 with: - python-version: '3.11.3' + python-version: '3.12' - name: Set up Emscripten toolchain uses: mymindstorm/setup-emsdk@v14 with: - version: '3.1.46' + version: '3.1.58' actions-cache-folder: emsdk-cache - name: Install pyodide-build - run: pip install "pyodide-build==0.25.1" + run: pip install "pyodide-build>=0.29.2" - name: Build pandas for Pyodide run: | @@ -439,10 +446,13 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '18' + node-version: '20' - name: Set up Pyodide virtual environment + env: + pyodide-version: '0.27.1' run: | + pyodide xbuildenv install ${{ env.pyodide-version }} pyodide venv .venv-pyodide source .venv-pyodide/bin/activate pip install dist/*.whl diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4bff9e7e090da..a4c2a732f9fc8 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,7 +94,8 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_x86_64] + - [ubuntu-24.04-arm, manylinux_aarch64] + - [macos-13, macosx_x86_64] # Note: M1 images on Github Actions start from macOS 14 - [macos-14, macosx_arm64] - [windows-2022, win_amd64] @@ -111,10 +112,6 @@ jobs: - buildplat: [ubuntu-22.04, pyodide_wasm32] python: ["cp312", "3.12"] cibw_build_frontend: 'build' - # TODO: Build free-threaded wheels for Windows - exclude: - - buildplat: [windows-2022, win_amd64] - python: ["cp313t", "3.13"] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} @@ -156,7 +153,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.3 + uses: pypa/cibuildwheel@v2.22.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -181,20 +178,6 @@ jobs: shell: bash -el {0} run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done 
- # Testing on windowsservercore instead of GHA runner to fail on missing DLLs - - name: Test Windows Wheels - if: ${{ matrix.buildplat[1] == 'win_amd64' }} - shell: pwsh - run: | - $TST_CMD = @" - python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; - python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; - "@ - # add rc to the end of the image name if the Python version is unreleased - docker pull python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - - uses: actions/upload-artifact@v4 with: name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} diff --git a/.gitignore b/.gitignore index a188e216d9f70..d951f3fb9cbad 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,7 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# Pyodide/WASM related files # +############################## +/.pyodide-xbuildenv-* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 87212309725c7..1dd8dfc54111e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.9 + rev: v0.8.6 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: - id: ruff-format exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.13' + rev: 'v2.14' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -47,7 +47,7 @@ repos: types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - id: double-quote-cython-strings @@ -74,7 +74,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.17.0 + rev: v3.19.1 hooks: - id: pyupgrade args: [--py310-plus] @@ -95,12 +95,17 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.1 + rev: v19.1.6 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] +- repo: https://github.com/trim21/pre-commit-mirror-meson + rev: v1.6.1 + hooks: + - id: meson-fmt + args: ['--inplace'] - repo: local hooks: - id: pyright diff --git a/LICENSE b/LICENSE index 2d1c34fd9d7fc..c343da2ebe870 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -Copyright (c) 2011-2024, Open source contributors. +Copyright (c) 2011-2025, Open source contributors. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/MANIFEST.in b/MANIFEST.in index a7d7d7eb4e062..c59151f340545 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -65,3 +65,5 @@ graft pandas/_libs/include # Include cibw script in sdist since it's needed for building wheels include scripts/cibw_before_build.sh +include scripts/cibw_before_build_windows.sh +include scripts/cibw_before_test_windows.sh diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index abffa1f702b9c..19c556dfe9d1f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -511,8 +511,7 @@ def setup(self, dtype, method, application, ncols, engine): # grouping on multiple columns # and we lack kernels for a bunch of methods if ( - engine == "numba" - and method in _numba_unsupported_methods + (engine == "numba" and method in _numba_unsupported_methods) or ncols > 1 or application == "transformation" or dtype == "datetime" diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index ff0ccffced0f3..3a15f754ae523 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -594,7 +594,7 @@ def setup(self): self.StringIO_input = StringIO(data) def time_read_csv_index_col(self): - read_csv(self.StringIO_input, index_col="a") + read_csv(self.data(self.StringIO_input), index_col="a") class ReadCSVDatePyarrowEngine(StringIORewind): @@ -605,7 +605,7 @@ def setup(self): def time_read_csv_index_col(self): read_csv( - self.StringIO_input, + self.data(self.StringIO_input), parse_dates=["a"], engine="pyarrow", dtype_backend="pyarrow", diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 768e05b16cfe9..ee5b7eb4f09fb 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -72,77 +72,16 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ - -i "pandas.Timestamp.nanosecond GL08" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_re_compilable PR07,SA01" \ - -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ - -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.IntegerArray SA01" \ - -i "pandas.arrays.IntervalArray.left SA01" \ - -i "pandas.arrays.IntervalArray.length SA01" \ - -i "pandas.arrays.IntervalArray.right SA01" \ - -i "pandas.arrays.NumpyExtensionArray SA01" \ - -i "pandas.arrays.SparseArray PR07,SA01" \ - -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ 
- -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ - -i "pandas.core.resample.Resampler.groups SA01" \ - -i "pandas.core.resample.Resampler.indices SA01" \ - -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.mean SA01" \ - -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.prod SA01" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ - -i "pandas.core.resample.Resampler.sem SA01" \ - -i "pandas.core.resample.Resampler.std SA01" \ - -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.AttributeConflictWarning SA01" \ - -i "pandas.errors.ChainedAssignmentError SA01" \ - -i "pandas.errors.DataError SA01" \ - -i "pandas.errors.DuplicateLabelError SA01" \ - -i "pandas.errors.IntCastingNaNError SA01" \ - -i "pandas.errors.InvalidIndexError SA01" \ - -i "pandas.errors.NullFrequencyError SA01" \ - -i "pandas.errors.NumExprClobberingError SA01" \ - -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.OutOfBoundsTimedelta SA01" \ - -i "pandas.errors.PerformanceWarning SA01" \ - -i "pandas.errors.PossibleDataLossError SA01" \ - -i "pandas.errors.UndefinedVariableError PR01,SA01" \ - -i "pandas.errors.UnsortedIndexError SA01" \ - -i "pandas.errors.ValueLabelTypeMismatch SA01" \ - -i "pandas.infer_freq SA01" \ - -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataWriter.write_file SA01" \ - -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ @@ -206,7 +145,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ @@ -214,7 +152,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ @@ -251,7 +188,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ -i "pandas.tseries.offsets.Hour.n GL08" \ -i "pandas.tseries.offsets.Hour.normalize GL08" \ 
- -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \ -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 22e4907e5a6e5..2d3d11c294e12 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -23,7 +23,7 @@ dependencies: - pip: - "tzdata>=2022.7" - - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" - "pyarrow" diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml deleted file mode 100644 index 3f09e27d0fe4b..0000000000000 --- a/ci/deps/circle-311-arm64.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer - - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=3.4.0 - - pytest-localserver>=0.8.1 - - pytest-qt>=4.4.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2023.10.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.84.0 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pytz>=2023.4 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0, <2024.10.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf new file mode 100644 index 0000000000000..ea356385e9fb1 Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx new file mode 100644 index 0000000000000..1b812c1a2595a Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx differ diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md index 6c33de104ed90..b8599acff2f6e 100644 --- a/doc/cheatsheet/README.md +++ b/doc/cheatsheet/README.md @@ -6,10 +6,12 @@ and pick "PDF" as the format. This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). 
-| Topic | PDF | PPT | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Pandas_Cheat_Sheet | | | -| Pandas_Cheat_Sheet_JA | | | +| Topic | Language | PDF | PPT | +|------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | English | | | +| Pandas_Cheat_Sheet_JA | Japanese | | | +| Pandas_Cheat_Sheet_FA | Persian | | | + **Alternative** diff --git a/doc/source/conf.py b/doc/source/conf.py index ddbda0aa3bf65..677ee6274b093 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -242,7 +242,6 @@ "external_links": [], "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", - "twitter_url": "https://twitter.com/pandas_dev", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", @@ -258,6 +257,11 @@ # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) "show_version_warning_banner": False, "icon_links": [ + { + "name": "X", + "url": "https://x.com/pandas_dev", + "icon": "fa-brands fa-square-x-twitter", + }, { "name": "Mastodon", "url": "https://fosstodon.org/@pandas_dev", diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index e174eea00ca60..59d7957275e15 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -940,7 +940,7 @@ Finally, docstrings can also be appended to with the ``doc`` decorator. In this example, we'll create a parent docstring normally (this is like ``pandas.core.generic.NDFrame``). Then we'll have two children (like -``pandas.core.series.Series`` and ``pandas.DataFrame``). We'll +``pandas.Series`` and ``pandas.DataFrame``). We'll substitute the class names in this docstring. .. code-block:: python @@ -995,5 +995,5 @@ mapping function names to docstrings. Wherever possible, we prefer using ``doc``, since the docstring-writing processes is slightly closer to normal. See ``pandas.core.generic.NDFrame.fillna`` for an example template, and -``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` +``pandas.Series.fillna`` and ``pandas.core.generic.frame.fillna`` for the filled versions. 
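To make the template substitution described above concrete, here is a minimal, self-contained sketch of how a ``doc``-style decorator can fill a shared docstring template per class. It is an illustration only, not pandas' actual ``pandas.util._decorators.doc`` implementation; the ``Parent``/``ChildA`` names and the ``klass`` parameter are invented for the example.

.. code-block:: python

   # Minimal sketch of sharing a docstring template via a decorator.
   def doc(**params):
       def decorator(func):
           # fill the {placeholders} in the template docstring
           func.__doc__ = func.__doc__.format(**params)
           return func
       return decorator

   class Parent:
       @doc(klass="Parent")
       def my_function(self):
           """Apply my function to {klass}."""

   class ChildA(Parent):
       @doc(klass="ChildA")
       def my_function(self):
           """Apply my function to {klass}."""

   print(Parent.my_function.__doc__)   # Apply my function to Parent.
   print(ChildA.my_function.__doc__)   # Apply my function to ChildA.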
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 1426d3a84a748..98bd4b00d016b 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -35,6 +35,10 @@ You will need `Build Tools for Visual Studio 2022 scrolling down to "All downloads" -> "Tools for Visual Studio". In the installer, select the "Desktop development with C++" Workloads. + If you encounter an error indicating ``cl.exe`` is not found when building with Meson, + reopen the installer and also select the optional component + **MSVC v142 - VS 2019 C++ x64/x86 build tools** in the right pane for installation. + Alternatively, you can install the necessary components on the commandline using `vs_BuildTools.exe `_ diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 1e4a851d0e72d..c572559dcc3e0 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -488,7 +488,7 @@ Post-Release for reference): - The pandas-dev and pydata mailing lists - - Twitter, Mastodon, Telegram and LinkedIn + - X, Mastodon, Telegram and LinkedIn 7. Update this release instructions to fix anything incorrect and to update about any change since the last release. diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 25ba237e8caf3..d9d7d916b0238 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -405,7 +405,7 @@ In Python, this list would be a list of tuples, so a = list(enumerate(list(range(1, 5)) + [np.NAN])) pd.DataFrame(a) -For more details and examples see :ref:`the Into to Data Structures +For more details and examples see :ref:`the Intro to Data Structures documentation `. meltdf diff --git a/doc/source/getting_started/comparison/comparison_with_spss.rst b/doc/source/getting_started/comparison/comparison_with_spss.rst new file mode 100644 index 0000000000000..12c64bfd180a3 --- /dev/null +++ b/doc/source/getting_started/comparison/comparison_with_spss.rst @@ -0,0 +1,229 @@ +.. _compare_with_spss: + +{{ header }} + +Comparison with SPSS +******************** +For potential users coming from `SPSS `__, this page is meant to demonstrate +how various SPSS operations would be performed using pandas. + +.. include:: includes/introduction.rst + +Data structures +--------------- + +General terminology translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "pandas", "SPSS" + :widths: 20, 20 + + :class:`DataFrame`, data file + column, variable + row, case + groupby, split file + :class:`NaN`, system-missing + +:class:`DataFrame` +~~~~~~~~~~~~~~~~~~ + +A :class:`DataFrame` in pandas is analogous to an SPSS data file - a two-dimensional +data source with labeled columns that can be of different types. As will be shown in this +document, almost any operation that can be performed in SPSS can also be accomplished in pandas. + +:class:`Series` +~~~~~~~~~~~~~~~ + +A :class:`Series` is the data structure that represents one column of a :class:`DataFrame`. SPSS doesn't have a +separate data structure for a single variable, but in general, working with a :class:`Series` is analogous +to working with a variable in SPSS. 
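As a small sketch of that column-as-variable analogy (the DataFrame and its ``age``/``height_cm`` columns below are invented for illustration and are not part of the page itself), selecting a single column of a DataFrame returns a Series:

.. code-block:: python

   import pandas as pd

   # each column of a DataFrame plays the role of an SPSS variable
   df = pd.DataFrame({"age": [21, 35, 58], "height_cm": [170, 182, 165]})

   ages = df["age"]   # selecting one column returns a Series
   ages.mean()        # 38.0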
+ +:class:`Index` +~~~~~~~~~~~~~~ + +Every :class:`DataFrame` and :class:`Series` has an :class:`Index` -- labels on the *rows* of the data. SPSS does not +have an exact analogue, as cases are simply numbered sequentially from 1. In pandas, if no index is +specified, a :class:`RangeIndex` is used by default (first row = 0, second row = 1, and so on). + +While using a labeled :class:`Index` or :class:`MultiIndex` can enable sophisticated analyses and is ultimately an +important part of pandas to understand, for this comparison we will essentially ignore the :class:`Index` and +just treat the :class:`DataFrame` as a collection of columns. Please see the :ref:`indexing documentation` +for much more on how to use an :class:`Index` effectively. + + +Copies vs. in place operations +------------------------------ + +.. include:: includes/copies.rst + + +Data input / output +------------------- + +Reading external data +~~~~~~~~~~~~~~~~~~~~~ + +Like SPSS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within +the pandas tests (`csv `_) +will be used in many of the following examples. + +In SPSS, you would use File > Open > Data to import a CSV file: + +.. code-block:: text + + FILE > OPEN > DATA + /TYPE=CSV + /FILE='tips.csv' + /DELIMITERS="," + /FIRSTCASE=2 + /VARIABLES=col1 col2 col3. + +The pandas equivalent would use :func:`read_csv`: + +.. code-block:: python + + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) + tips = pd.read_csv(url) + tips + +Like SPSS's data import wizard, ``read_csv`` can take a number of parameters to specify how the data should be parsed. +For example, if the data was instead tab delimited, and did not have column names, the pandas command would be: + +.. code-block:: python + + tips = pd.read_csv("tips.csv", sep="\t", header=None) + + # alternatively, read_table is an alias to read_csv with tab delimiter + tips = pd.read_table("tips.csv", header=None) + + +Data operations +--------------- + +Filtering +~~~~~~~~~ + +In SPSS, filtering is done through Data > Select Cases: + +.. code-block:: text + + SELECT IF (total_bill > 10). + EXECUTE. + +In pandas, boolean indexing can be used: + +.. code-block:: python + + tips[tips["total_bill"] > 10] + + +Sorting +~~~~~~~ + +In SPSS, sorting is done through Data > Sort Cases: + +.. code-block:: text + + SORT CASES BY sex total_bill. + EXECUTE. + +In pandas, this would be written as: + +.. code-block:: python + + tips.sort_values(["sex", "total_bill"]) + + +String processing +----------------- + +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE length = LENGTH(time). + EXECUTE. + +.. include:: includes/length.rst + + +Changing case +~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE upper = UPCASE(time). + COMPUTE lower = LOWER(time). + EXECUTE. + +.. include:: includes/case.rst + + +Merging +------- + +In SPSS, merging data files is done through Data > Merge Files. + +.. include:: includes/merge_setup.rst +.. include:: includes/merge.rst + + +GroupBy operations +------------------ + +Split-file processing +~~~~~~~~~~~~~~~~~~~~~ + +In SPSS, split-file analysis is done through Data > Split File: + +.. code-block:: text + + SORT CASES BY sex. + SPLIT FILE BY sex. + DESCRIPTIVES VARIABLES=total_bill tip + /STATISTICS=MEAN STDDEV MIN MAX. + +The pandas equivalent would be: + +.. 
code-block:: python + + tips.groupby("sex")[["total_bill", "tip"]].agg(["mean", "std", "min", "max"]) + + +Missing data +------------ + +SPSS uses the period (``.``) for numeric missing values and blank spaces for string missing values. +pandas uses ``NaN`` (Not a Number) for numeric missing values and ``None`` or ``NaN`` for string +missing values. + +.. include:: includes/missing.rst + + +Other considerations +-------------------- + +Output management +----------------- + +While pandas does not have a direct equivalent to SPSS's Output Management System (OMS), you can +capture and export results in various ways: + +.. code-block:: python + + # Save summary statistics to CSV + tips.groupby('sex')[['total_bill', 'tip']].mean().to_csv('summary.csv') + + # Save multiple results to Excel sheets + with pd.ExcelWriter('results.xlsx') as writer: + tips.describe().to_excel(writer, sheet_name='Descriptives') + tips.groupby('sex').mean().to_excel(writer, sheet_name='Means by Gender') diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst index c3f58ce1f3d6d..3133d74afa3db 100644 --- a/doc/source/getting_started/comparison/index.rst +++ b/doc/source/getting_started/comparison/index.rst @@ -14,3 +14,4 @@ Comparison with other tools comparison_with_spreadsheets comparison_with_sas comparison_with_stata + comparison_with_spss diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b3982c4ad091f..bda959f380e8a 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -193,25 +193,25 @@ Visualization Installable with ``pip install "pandas[plot, output-formatting]"``. -========================= ================== ================== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== ================== ============================================================= -matplotlib 3.6.3 plot Plotting library -Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style -tabulate 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) -========================= ================== ================== ============================================================= +========================================================== ================== ================== ======================================================= +Dependency Minimum Version pip extra Notes +========================================================== ================== ================== ======================================================= +`matplotlib `__ 3.6.3 plot Plotting library +`Jinja2 `__ 3.1.2 output-formatting Conditional formatting with DataFrame.style +`tabulate `__ 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) +========================================================== ================== ================== ======================================================= Computation ^^^^^^^^^^^ Installable with ``pip install "pandas[computation]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -SciPy 1.10.0 computation Miscellaneous statistical functions -xarray 2022.12.0 computation pandas-like API for N-dimensional data -========================= ================== =============== ============================================================= +============================================== ================== =============== ======================================= +Dependency Minimum Version pip extra Notes +============================================== ================== =============== ======================================= +`SciPy `__ 1.10.0 computation Miscellaneous statistical functions +`xarray `__ 2022.12.0 computation pandas-like API for N-dimensional data +============================================== ================== =============== ======================================= .. _install.excel_dependencies: @@ -220,29 +220,29 @@ Excel files Installable with ``pip install "pandas[excel]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -xlrd 2.0.1 excel Reading for xls files -xlsxwriter 3.0.5 excel Writing for xlsx files -openpyxl 3.1.0 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files -pyxlsb 1.0.10 excel Reading for xlsb files -python-calamine 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files -odfpy 1.4.1 excel Reading / writing for OpenDocument 1.2 files -========================= ================== =============== ============================================================= +================================================================== ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +================================================================== ================== =============== ============================================================= +`xlrd `__ 2.0.1 excel Reading for xls files +`xlsxwriter `__ 3.0.5 excel Writing for xlsx files +`openpyxl `__ 3.1.0 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files +`pyxlsb `__ 1.0.10 excel Reading for xlsb files +`python-calamine `__ 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files +`odfpy `__ 1.4.1 excel Reading / writing for OpenDocument 1.2 files +================================================================== ================== =============== ============================================================= HTML ^^^^ Installable with ``pip install "pandas[html]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -BeautifulSoup4 4.11.2 html HTML parser for read_html -html5lib 1.1 html HTML parser for read_html -lxml 4.9.2 html HTML parser for read_html -========================= ================== =============== ============================================================= +=============================================================== ================== =============== ========================== +Dependency Minimum Version pip extra Notes +=============================================================== ================== =============== ========================== +`BeautifulSoup4 `__ 4.11.2 html HTML parser for read_html +`html5lib `__ 1.1 html HTML parser for read_html +`lxml `__ 4.9.2 html HTML parser for read_html +=============================================================== ================== =============== ========================== One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: @@ -273,45 +273,45 @@ XML Installable with ``pip install "pandas[xml]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -lxml 4.9.2 xml XML parser for read_xml and tree builder for to_xml -========================= ================== =============== ============================================================= +======================================== ================== =============== ==================================================== +Dependency Minimum Version pip extra Notes +======================================== ================== =============== ==================================================== +`lxml `__ 4.9.2 xml XML parser for read_xml and tree builder for to_xml +======================================== ================== =============== ==================================================== SQL databases ^^^^^^^^^^^^^ Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -SQLAlchemy 2.0.0 postgresql, SQL support for databases other than sqlite - mysql, - sql-other -psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy -pymysql 1.0.2 mysql MySQL engine for sqlalchemy -adbc-driver-postgresql 0.10.0 postgresql ADBC Driver for PostgreSQL -adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite -========================= ================== =============== ============================================================= +================================================================== ================== =============== ============================================ +Dependency Minimum Version pip extra Notes +================================================================== ================== =============== ============================================ +`SQLAlchemy `__ 2.0.0 
postgresql, SQL support for databases other than sqlite + mysql, + sql-other +`psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy +`pymysql `__ 1.0.2 mysql MySQL engine for sqlalchemy +`adbc-driver-postgresql `__ 0.10.0 postgresql ADBC Driver for PostgreSQL +`adbc-driver-sqlite `__ 0.8.0 sql-other ADBC Driver for SQLite +================================================================== ================== =============== ============================================ Other data sources ^^^^^^^^^^^^^^^^^^ Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"`` -========================= ================== ================ ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== ================ ============================================================= -PyTables 3.8.0 hdf5 HDF5-based reading / writing -blosc 1.21.3 hdf5 Compression for HDF5; only available on ``conda`` -zlib hdf5 Compression for HDF5 -fastparquet 2023.10.0 - Parquet reading / writing (pyarrow is default) -pyarrow 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing -pyreadstat 1.2.0 spss SPSS files (.sav) reading -odfpy 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing -========================= ================== ================ ============================================================= +====================================================== ================== ================ ========================================================== +Dependency Minimum Version pip extra Notes +====================================================== ================== ================ ========================================================== +`PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing +`blosc `__ 1.21.3 hdf5 Compression for HDF5; only available on ``conda`` +`zlib `__ hdf5 Compression for HDF5 +`fastparquet `__ 2023.10.0 - Parquet reading / writing (pyarrow is default) +`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +`pyreadstat `__ 1.2.0 spss SPSS files (.sav) reading +`odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing +====================================================== ================== ================ ========================================================== .. _install.warn_orc: @@ -326,26 +326,26 @@ Access data in the cloud Installable with ``pip install "pandas[fss, aws, gcp]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -fsspec 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required - dependency of s3fs, gcsfs). 
-gcsfs 2022.11.0 gcp Google Cloud Storage access -s3fs 2022.11.0 aws Amazon S3 access -========================= ================== =============== ============================================================= +============================================ ================== =============== ========================================================== +Dependency Minimum Version pip extra Notes +============================================ ================== =============== ========================================================== +`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required + dependency of s3fs, gcsfs). +`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access +`s3fs `__ 2022.11.0 aws Amazon S3 access +============================================ ================== =============== ========================================================== Clipboard ^^^^^^^^^ Installable with ``pip install "pandas[clipboard]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O -qtpy 2.3.0 clipboard Clipboard I/O -========================= ================== =============== ============================================================= +======================================================================================== ================== =============== ============== +Dependency Minimum Version pip extra Notes +======================================================================================== ================== =============== ============== +`PyQt4 `__/`PyQt5 `__ 5.15.9 clipboard Clipboard I/O +`qtpy `__ 2.3.0 clipboard Clipboard I/O +======================================================================================== ================== =============== ============== .. note:: @@ -358,19 +358,19 @@ Compression Installable with ``pip install "pandas[compression]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -Zstandard 0.19.0 compression Zstandard compression -========================= ================== =============== ============================================================= +================================================= ================== =============== ====================== +Dependency Minimum Version pip extra Notes +================================================= ================== =============== ====================== +`Zstandard `__ 0.19.0 compression Zstandard compression +================================================= ================== =============== ====================== Timezone ^^^^^^^^ Installable with ``pip install "pandas[timezone]"`` -========================= ================== =================== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =================== ============================================================= -pytz 2023.4 timezone Alternative timezone library to ``zoneinfo``. 
-========================= ================== =================== ============================================================= +========================================== ================== =================== ============================================== +Dependency Minimum Version pip extra Notes +========================================== ================== =================== ============================================== +`pytz `__ 2023.4 timezone Alternative timezone library to ``zoneinfo``. +========================================== ================== =================== ============================================== diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4393c3716bdad..eae7771418485 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -112,7 +112,7 @@ Various tutorials * `Wes McKinney's (pandas BDFL) blog `_ * `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ -* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ +* `Statistical Data Analysis in Python, tutorial by Christopher Fonnesbeck from SciPy 2013 `_ * `Financial analysis in Python, by Thomas Wiecki `_ * `Intro to pandas data structures, by Greg Reda `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 7680c8b434866..e701d48a89db7 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -185,7 +185,6 @@ Reindexing / selection / label manipulation DataFrame.duplicated DataFrame.equals DataFrame.filter - DataFrame.head DataFrame.idxmax DataFrame.idxmin DataFrame.reindex @@ -196,7 +195,6 @@ Reindexing / selection / label manipulation DataFrame.sample DataFrame.set_axis DataFrame.set_index - DataFrame.tail DataFrame.take DataFrame.truncate diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 3b02ffe20c10e..fc180c8161a7e 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -104,6 +104,7 @@ Function application DataFrameGroupBy.shift DataFrameGroupBy.size DataFrameGroupBy.skew + DataFrameGroupBy.kurt DataFrameGroupBy.std DataFrameGroupBy.sum DataFrameGroupBy.var @@ -159,6 +160,7 @@ Function application SeriesGroupBy.shift SeriesGroupBy.size SeriesGroupBy.skew + SeriesGroupBy.kurt SeriesGroupBy.std SeriesGroupBy.sum SeriesGroupBy.var diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 0e1d93841d52f..742263c788c2f 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -27,6 +27,7 @@ Styler properties Styler.template_html_style Styler.template_html_table Styler.template_latex + Styler.template_typst Styler.template_string Styler.loader @@ -77,6 +78,7 @@ Style export and import Styler.to_html Styler.to_latex + Styler.to_typst Styler.to_excel Styler.to_string Styler.export diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 14af2b8a120e0..2aeb57faac112 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -30,11 +30,14 @@ Rolling window functions Rolling.std Rolling.min Rolling.max + Rolling.first + Rolling.last Rolling.corr Rolling.cov Rolling.skew Rolling.kurt Rolling.apply + Rolling.pipe Rolling.aggregate Rolling.quantile Rolling.sem @@ -71,11 +74,14 @@ Expanding window functions Expanding.std Expanding.min Expanding.max + 
Expanding.first + Expanding.last Expanding.corr Expanding.cov Expanding.skew Expanding.kurt Expanding.apply + Expanding.pipe Expanding.aggregate Expanding.quantile Expanding.sem diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 1525afcac87f7..b2b5c5cc1014e 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp, include_groups=False) + expected_df = gb.apply(GrowUp) expected_df `Expanding apply diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index b9c285ca30c96..89981786d60b5 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -326,7 +326,7 @@ This case is handled identically to a dict of arrays. .. ipython:: python - data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")]) data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 842f30f06676e..e85eead4e0f09 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -372,5 +372,5 @@ constructors using something similar to the following: s = pd.Series(newx) See `the NumPy documentation on byte order -`__ for more +`__ for more details. diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index acb5a2b7919ac..4a32381a7de47 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1074,7 +1074,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D", include_groups=False).ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -1252,13 +1252,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=True).apply(lambda x: x) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=False).apply(lambda x: x) Numba accelerated routines @@ -1742,7 +1742,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics, include_groups=False) + result = df.groupby("a").apply(compute_metrics) result diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 503f7cc7cbe73..ed5c7806b2e23 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -858,9 +858,10 @@ and :ref:`Advanced Indexing ` you may select along more than one axis .. warning:: - ``iloc`` supports two kinds of boolean indexing. If the indexer is a boolean ``Series``, - an error will be raised. 
For instance, in the following example, ``df.iloc[s.values, 1]`` is ok. - The boolean indexer is an array. But ``df.iloc[s, 1]`` would raise ``ValueError``. + While ``loc`` supports two kinds of boolean indexing, ``iloc`` only supports indexing with a + boolean array. If the indexer is a boolean ``Series``, an error will be raised. For instance, + in the following example, ``df.iloc[s.values, 1]`` is ok. The boolean indexer is an array. + But ``df.iloc[s, 1]`` would raise ``ValueError``. .. ipython:: python diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7c165c87adb46..daf323acff129 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2340,6 +2340,7 @@ Read a URL with no options: .. code-block:: ipython In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" + In [321]: pd.read_html(url) Out[321]: [ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund @@ -2366,6 +2367,7 @@ Read a URL while passing headers alongside the HTTP request: .. code-block:: ipython In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector + In [323]: pd.read_html(url) Out[323]: [ 0 1 @@ -2378,14 +2380,16 @@ Read a URL while passing headers alongside the HTTP request: 1 Host: www.sump.org 2 User-Agent: Python-urllib/3.8 3 Connection: close] + In [324]: headers = { - In [325]: 'User-Agent':'Mozilla Firefox v14.0', - In [326]: 'Accept':'application/json', - In [327]: 'Connection':'keep-alive', - In [328]: 'Auth':'Bearer 2*/f3+fe68df*4' - In [329]: } - In [340]: pd.read_html(url, storage_options=headers) - Out[340]: + .....: 'User-Agent':'Mozilla Firefox v14.0', + .....: 'Accept':'application/json', + .....: 'Connection':'keep-alive', + .....: 'Auth':'Bearer 2*/f3+fe68df*4' + .....: } + + In [325]: pd.read_html(url, storage_options=headers) + Out[325]: [ 0 1 0 Remote Socket: 51.15.105.256:51760 1 Protocol Version: HTTP/1.1 diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 3347f3a2534f4..8c5e98791a9ef 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` ar .. image:: ../_static/reshaping_melt.png The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt` -are useful to massage a :class:`DataFrame` into a format where one or more columns +are useful to reshape a :class:`DataFrame` into a format where one or more columns are *identifier variables*, while all other columns, considered *measured variables*, are "unpivoted" to the row axis, leaving just two non-identifier columns, "variable" and "value". The names of those columns can be customized diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 66eeb74b363a3..4b5cdca23103c 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1210,11 +1210,6 @@ You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labe for x and y axis. By default, pandas will pick up index name as xlabel, while leaving it empty for ylabel. -.. ipython:: python - :suppress: - - plt.figure(); - .. 
ipython:: python df.plot(); diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index e25c4c2441920..0581951d5bfad 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -567,9 +567,9 @@ One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass \alpha = \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 + \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ + \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ + 1 - e^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 \end{cases} One must specify precisely one of **span**, **center of mass**, **half-life** diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 5d72fabedcee8..8bdddb5b7f85d 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,8 +32,13 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ +- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called + when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been + updated to work correctly with NumPy >= 2 (:issue:`57739`) +- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) +- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_230.notable_bug_fixes: @@ -92,7 +97,7 @@ Timezones Numeric ^^^^^^^ -- +- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) - Conversion @@ -102,11 +107,12 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) -- Interval ^^^^^^^^ @@ -115,7 +121,7 @@ Interval Indexing ^^^^^^^^ -- +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - Missing @@ -130,7 +136,7 @@ MultiIndex I/O ^^^ -- +- :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`) - Period @@ -170,8 +176,8 @@ Styler Other ^^^^^ -- -- +- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` + are not installed (:issue:`60196`) .. --------------------------------------------------------------------------- .. 
_whatsnew_230.contributors: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c61b8f3fb3701..9089b9cdd2185 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,6 +30,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) +- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) +- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) @@ -44,6 +47,7 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :class:`Rolling` and :class:`Expanding` now support the ``pipe`` method (:issue:`57076`) - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) @@ -51,13 +55,20 @@ Other enhancements - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) +- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`) - :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept a ``skipna`` parameter (:issue:`15675`) +- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) +- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are
forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) +- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) +- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) -- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) +- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) @@ -481,7 +492,7 @@ Other Removals - Enforced deprecation of :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for object-dtype (:issue:`57820`) - Enforced deprecation of :meth:`offsets.Tick.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`) -- Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`) +- Enforced deprecation of ``core.internals`` member ``DatetimeTZBlock`` (:issue:`58467`) - Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`) - Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`) - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead.
(:issue:`52550`) @@ -552,6 +563,7 @@ Other Removals - Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`) - Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`) - Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`) +- Removed specifying ``include_groups=True`` in :class:`.DataFrameGroupBy.apply` and :class:`.Resampler.apply` (:issue:`7155`) .. --------------------------------------------------------------------------- .. _whatsnew_300.performance: @@ -613,6 +625,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) +- Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) @@ -624,7 +637,9 @@ Datetimelike - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) +- Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`) - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`) +- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. 
(:issue:`60341`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta @@ -664,7 +679,8 @@ Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) -- +- Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) +- Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) Missing ^^^^^^^ @@ -677,6 +693,7 @@ MultiIndex - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`) - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`) - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) +- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`) - I/O @@ -687,6 +704,7 @@ I/O - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) @@ -696,11 +714,14 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) +- Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) +- Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) Period ^^^^^^ @@ -710,6 +731,7 @@ Period Plotting ^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`) +- Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`) - Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`) - Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`) - Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`) @@ -724,21 +746,25 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) +- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) +- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time.
(:issue:`58380`) Reshaping ^^^^^^^^^ - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) +- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) +- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`) @@ -753,11 +779,12 @@ ExtensionArray - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) +- Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`) - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler ^^^^^^ -- +- Bug in :meth:`Styler.to_latex` when styling column headers in combination with a hidden index or hidden index-levels. Other ^^^^^ @@ -770,17 +797,27 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow using the ``tan`` function. (:issue:`55091`) +- Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`) - Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) +- Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly.
(:issue:`60102`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) +- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. (:issue:`60688`) +- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) +- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) +- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) +- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) +- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) +- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) .. 
***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index 5ef5fbe910427..69647a436e3ad 100644 --- a/environment.yml +++ b/environment.yml @@ -35,6 +35,7 @@ dependencies: - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - ipython + - pickleshare # Needed for IPython Sphinx directive in the docs GH#60429 - jinja2>=3.1.2 - lxml>=4.9.2 - matplotlib>=3.6.3 @@ -77,7 +78,7 @@ dependencies: # code checks - flake8=7.1.0 # run in subprocess over docstring examples - - mypy=1.11.2 # pre-commit uses locally installed mypy + - mypy=1.13.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=4.0.1 @@ -87,7 +88,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme=0.14 + - pydata-sphinx-theme=0.16 - pytest-cython # doctest - sphinx - sphinx-design diff --git a/meson.build b/meson.build index efe543b7a267c..66583095a6e77 100644 --- a/meson.build +++ b/meson.build @@ -1,15 +1,13 @@ # This file is adapted from https://github.com/scipy/scipy/blob/main/meson.build project( 'pandas', - 'c', 'cpp', 'cython', + 'c', + 'cpp', + 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.2.1', - default_options: [ - 'buildtype=release', - 'c_std=c11', - 'warning_level=2', - ] + default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'], ) fs = import('fs') @@ -18,41 +16,40 @@ tempita = files('generate_pxi.py') versioneer = files('generate_version.py') -add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'c') -add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') +add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'c') +add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g. 
right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language: 'c') +add_project_arguments( + '-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', + language: 'cpp', +) if fs.exists('_version_meson.py') py.install_sources('_version_meson.py', subdir: 'pandas') else - custom_target('write_version_file', + custom_target( + 'write_version_file', output: '_version_meson.py', - command: [ - py, versioneer, '-o', '@OUTPUT@' - ], + command: [py, versioneer, '-o', '@OUTPUT@'], build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir() / 'pandas' + install_dir: py.get_install_dir() / 'pandas', ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif cy = meson.get_compiler('cython') if cy.version().version_compare('>=3.1.0') - add_project_arguments('-Xfreethreading_compatible=true', language : 'cython') + add_project_arguments('-Xfreethreading_compatible=true', language: 'cython') endif # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources( - 'pyproject.toml', - subdir: 'pandas' -) +py.install_sources('pyproject.toml', subdir: 'pandas') subdir('pandas') diff --git a/pandas/__init__.py b/pandas/__init__.py index 6c97baa890777..c570fb8d70204 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -235,6 +235,7 @@ # Pandas is not (yet) a py.typed library: the public API is determined # based on the documentation. __all__ = [ + "NA", "ArrowDtype", "BooleanDtype", "Categorical", @@ -253,15 +254,14 @@ "HDFStore", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", "MultiIndex", - "NA", "NaT", "NamedAgg", "Period", @@ -274,10 +274,10 @@ "Timedelta", "TimedeltaIndex", "Timestamp", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", "api", "array", "arrays", @@ -290,8 +290,8 @@ "errors", "eval", "factorize", - "get_dummies", "from_dummies", + "get_dummies", "get_option", "infer_freq", "interval_range", diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 80d9ea1b364f3..463e8af7cc561 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -8,13 +8,13 @@ __all__ = [ "config", + "describe_option", "detect_console_encoding", "get_option", - "set_option", - "reset_option", - "describe_option", "option_context", "options", + "reset_option", + "set_option", ] from pandas._config import config from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 1d57aa806e0f1..0d06e6fa8e96c 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -188,6 +188,11 @@ def set_option(*args) -> None: """ Set the value of the specified option or options. + This method allows fine-grained control over the behavior and display settings + of pandas. Options affect various functionalities such as output formatting, + display limits, and operational behavior. Settings can be modified at runtime + without requiring changes to global configurations or environment variables. + Parameters ---------- *args : str | object @@ -316,6 +321,11 @@ def reset_option(pat: str) -> None: """ Reset one or more options to their default value. 
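# Illustrative aside (not part of the patch): a minimal sketch of the runtime option
# workflow that the docstring additions above describe. "display.max_rows" is simply a
# familiar, real pandas option name used here for demonstration.
import pandas as pd

pd.set_option("display.max_rows", 50)       # change an option at runtime
assert pd.get_option("display.max_rows") == 50
pd.reset_option("display.max_rows")         # restore the default value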
+ This method resets the specified pandas option(s) back to their default + values. It allows partial string matching for convenience, but users should + exercise caution to avoid unintended resets due to changes in option names + in future versions. + Parameters ---------- pat : str/regex diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index 26a872a90e493..d499f9a6cd75e 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,4 +1,5 @@ __all__ = [ + "Interval", "NaT", "NaTType", "OutOfBoundsDatetime", @@ -6,7 +7,6 @@ "Timedelta", "Timestamp", "iNaT", - "Interval", ] diff --git a/pandas/_libs/byteswap.pyx b/pandas/_libs/byteswap.pyx index 67cd7ad58d229..7a8a9fc5a9139 100644 --- a/pandas/_libs/byteswap.pyx +++ b/pandas/_libs/byteswap.pyx @@ -15,7 +15,7 @@ from libc.string cimport memcpy def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint32_t value - assert offset + sizeof(value) < len(data) + assert offset + sizeof(value) < len(data) cdef const void *ptr = (data) + offset memcpy(&value, ptr, sizeof(value)) if byteswap: @@ -28,7 +28,7 @@ def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint64_t value - assert offset + sizeof(value) < len(data) + assert offset + sizeof(value) < len(data) cdef const void *ptr = (data) + offset memcpy(&value, ptr, sizeof(value)) if byteswap: @@ -41,7 +41,7 @@ def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint16_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap2(res) @@ -50,7 +50,7 @@ def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint32_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap4(res) @@ -59,7 +59,7 @@ def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint64_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap8(res) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 53f5f73624232..163fc23535022 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,6 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint + skipna: bool = ..., ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -66,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -75,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -87,6 +90,7 @@ def group_var( result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., name: str = ..., + skipna: bool = ..., ) -> None: ... 
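# Illustrative aside (not part of the patch): the user-facing behaviour that the new
# ``skipna`` arguments threaded through these signatures enable, per the whatsnew entry
# above. A sketch assuming the pandas 3.0 groupby API described there.
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
df.groupby("key")["val"].sum()               # default skipna=True: a -> 1.0, b -> 3.0
df.groupby("key")["val"].sum(skipna=False)   # a -> NaN, since that group contains a missing value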
def group_skew( out: np.ndarray, # float64_t[:, ::1] @@ -97,6 +101,15 @@ def group_skew( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... +def group_kurt( + out: np.ndarray, # float64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[float64_T, ndim=2] + labels: np.ndarray, # const intp_t[::1] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] @@ -106,6 +119,7 @@ def group_mean( is_datetimelike: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_ohlc( out: np.ndarray, # floatingintuint_t[:, ::1] @@ -172,6 +186,7 @@ def group_max( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -182,6 +197,7 @@ def group_min( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_idxmin_idxmax( out: npt.NDArray[np.intp], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d7e485f74e58b..16a104a46ed3d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n cdef float64_t median_linear( float64_t* a, int n, - bint is_datetimelike=False + bint is_datetimelike=False, + bint skipna=True, ) noexcept nogil: cdef: int i, j, na_count = 0 @@ -125,7 +131,7 @@ cdef float64_t median_linear( na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -186,6 +192,7 @@ def group_median_float64( const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -229,7 +236,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -244,7 +251,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size, is_datetimelike) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -700,18 +707,19 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 using Kahan summation """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - sum_t val, t, y + sum_t val, t, y, nan_val sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint 
uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -722,6 +730,15 @@ def group_sum( compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape + if uses_mask: + nan_val = 0 + elif is_datetimelike: + nan_val = NPY_NAT + elif sum_t is int64_t or sum_t is uint64_t: + # This has no effect as int64 can't be nan. Setting to 0 to avoid type error + nan_val = 0 + else: + nan_val = NAN with nogil(sum_t is not object): for i in range(N): @@ -736,8 +753,16 @@ def group_sum( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in a NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -765,6 +790,11 @@ def group_sum( # because of no gil compensation[lab, j] = 0 sumx[lab, j] = t + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx @@ -781,17 +811,18 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64float_t val + int64float_t val, nan_val int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -800,6 +831,7 @@ def group_prod( prodx = np.ones((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = _get_na_val(0, False) with nogil: for i in range(N): @@ -813,12 +845,23 @@ def group_prod( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, False) + isna_result = _treat_as_na(prodx[lab, j], False) + + if not skipna and isna_result: + # If prod is already NA, no need to update it + continue if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -839,6 +882,7 @@ def group_var( uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", + bint skipna=True, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -846,7 +890,7 @@ def group_var( floating[:, ::1] mean int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -875,19 +919,34 @@ def group_var( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] elif is_datetimelike: # With group_var, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64.
isna_entry = val == NPY_NAT + isna_result = out[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If aggregate is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in a NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + elif not skipna: + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = NAN for i in range(ncounts): for j in range(K): @@ -910,7 +969,7 @@ def group_var( @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -@cython.cpow +@cython.cpow(True) def group_skew( float64_t[:, ::1] out, int64_t[::1] counts, @@ -961,7 +1020,7 @@ def group_skew( isna_entry = _treat_as_na(val, False) if not isna_entry: - # Based on RunningStats::Push from + # Running stats update based on RunningStats::Push from # https://www.johndcook.com/blog/skewness_kurtosis/ n1 = nobs[lab, j] n = n1 + 1 @@ -995,6 +1054,100 @@ def group_skew( ) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.cpow(True) +def group_kurt( + float64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[float64_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, + bint skipna=True, +) -> None: + cdef: + Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None + float64_t[:, ::1] M1, M2, M3, M4 + float64_t delta, delta_n, delta_n2, term1, val + int64_t n1, n + float64_t ct, num, den, adj + + if len_values != len_labels: + raise ValueError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + + # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments + M1 = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + M4 = np.zeros((out).shape, dtype=np.float64) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, False) + + if not isna_entry: + # Running stats update based on RunningStats::Push from + # https://www.johndcook.com/blog/skewness_kurtosis/ + n1 = nobs[lab, j] + n = n1 + 1 + + nobs[lab, j] = n + delta = val - M1[lab, j] + delta_n = delta / n + delta_n2 = delta_n * delta_n + term1 = delta * delta_n * n1 + + M1[lab, j] += delta_n + M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3) + + 6 * delta_n2 * M2[lab, j] + - 4 * delta_n * M3[lab, j]) + M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j] + M2[lab, j] += term1 + elif not skipna: + M1[lab, j] = NaN + M2[lab, j] = NaN + M3[lab, j] = NaN + M4[lab, j] = NaN + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 4: + if result_mask is not None: + result_mask[i, j] = 1 + out[i, j] = NaN + elif M2[i, j] == 0: + out[i, j] = 0 + else: + num = ct * (ct + 1) * (ct - 1) * M4[i, j] + den = (ct - 2) * (ct - 3) * M2[i, j] ** 2 + adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3)) + out[i, j] = num / den - adj + + 
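# Illustrative aside (not part of the patch): a naive NumPy restatement of the adjusted
# excess-kurtosis formula that group_kurt accumulates incrementally via the running
# moments M1..M4 above; handy as a cross-check on a single group.
import numpy as np

def naive_group_kurt(values):
    vals = np.asarray(values, dtype=float)
    vals = vals[~np.isnan(vals)]
    n = len(vals)
    if n < 4:
        return np.nan           # mirrors the ct < 4 guard above
    centered = vals - vals.mean()
    m2 = (centered ** 2).sum()  # corresponds to the running M2
    m4 = (centered ** 4).sum()  # corresponds to the running M4
    if m2 == 0:
        return 0.0
    num = n * (n + 1) * (n - 1) * m4
    den = (n - 2) * (n - 3) * m2 ** 2
    adj = 3.0 * (n - 1) ** 2 / ((n - 2) * (n - 3))
    return num / den - adj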
@cython.wraparound(False) @cython.boundscheck(False) def group_mean( @@ -1006,6 +1159,7 @@ def group_mean( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """ Compute the mean per label given a label assignment for each value. @@ -1031,6 +1185,8 @@ def group_mean( Mask of the input values. result_mask : ndarray[bool, ndim=2], optional Mask of the out array + skipna : bool, optional + If True, ignore nans in `values`. Notes ----- @@ -1044,7 +1200,7 @@ def group_mean( mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None assert min_count == -1, "'min_count' only used in sum and prod" @@ -1076,13 +1232,22 @@ def group_mean( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] elif is_datetimelike: # With group_mean, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT + isna_result = sumx[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -1097,6 +1262,14 @@ def group_mean( # because of no gil compensation[lab, j] = 0. sumx[lab, j] = t + elif not skipna: + # Set the nobs to 0 so that in case of datetimelike, + # dividing NPY_NAT by nobs may not result in a NPY_NAT + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val for i in range(ncounts): for j in range(K): @@ -1668,6 +1841,7 @@ cdef group_min_max( bint compute_max=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1695,6 +1869,8 @@ cdef group_min_max( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. + skipna : bool, default True + If True, ignore nans in `values`. 
Notes ----- @@ -1703,17 +1879,18 @@ cdef group_min_max( """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - numeric_t val + numeric_t val, nan_val numeric_t[:, ::1] group_min_or_max int64_t[:, ::1] nobs bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) + nan_val = _get_na_val(0, is_datetimelike) group_min_or_max = np.empty_like(out) group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) @@ -1732,8 +1909,15 @@ cdef group_min_max( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(group_min_or_max[lab, j], + is_datetimelike) + + if not skipna and isna_result: + # If current min/max is already NA, it will always be NA + continue if not isna_entry: nobs[lab, j] += 1 @@ -1743,6 +1927,11 @@ cdef group_min_max( else: if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max @@ -1874,6 +2063,7 @@ def group_max( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1886,6 +2076,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -1900,6 +2091,7 @@ def group_min( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1912,6 +2104,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 9706a8211b61f..04e2d1e39ce2a 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -34,11 +34,9 @@ static void *traced_calloc(size_t num, size_t size) { } static void *traced_realloc(void *old_ptr, size_t size) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); void *ptr = realloc(old_ptr, size); if (ptr != NULL) { - if (old_ptr != ptr) { - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); - } PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); } return ptr; diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h index 0d62bb0ba915c..51fdbc50bba57 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h +++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h @@ -170,8 +170,8 @@ typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); -typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen); +typedef const char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen); typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void 
*(*JSPFN_REALLOC)(void *base, size_t size); diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index bf6d8ba8973d3..3af2856d2fbbf 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # np.ndarray[..., ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1506a76aa94a6..c219d0b63870f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -557,6 +557,23 @@ cdef class StringEngine(IndexEngine): raise KeyError(val) return str(val) +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif checknull(val): + return self.na_value + else: + raise KeyError(val) + cdef class DatetimeEngine(Int64Engine): diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de603beff7836..5239aa2c61dc5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1882,7 +1882,7 @@ cdef class BoolValidator(Validator): cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: - BoolValidator validator = BoolValidator(len(values), + BoolValidator validator = BoolValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1900,7 +1900,7 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: - IntegerValidator validator = IntegerValidator(len(values), + IntegerValidator validator = IntegerValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1915,7 +1915,7 @@ cdef class IntegerNaValidator(Validator): cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: - IntegerNaValidator validator = IntegerNaValidator(len(values), + IntegerNaValidator validator = IntegerNaValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1931,7 +1931,7 @@ cdef class IntegerFloatValidator(Validator): cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: - IntegerFloatValidator validator = IntegerFloatValidator(len(values), + IntegerFloatValidator validator = IntegerFloatValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1949,7 +1949,7 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests cpdef bint is_float_array(ndarray values): cdef: - FloatValidator validator = FloatValidator(len(values), values.dtype) + FloatValidator validator = FloatValidator(values.size, values.dtype) return validator.validate(values) @@ -1967,7 +1967,7 @@ cdef class ComplexValidator(Validator): cdef bint is_complex_array(ndarray values): cdef: - ComplexValidator validator = ComplexValidator(len(values), values.dtype) + ComplexValidator validator = ComplexValidator(values.size, values.dtype) return validator.validate(values) @@ -1980,7 +1980,7 @@ cdef class DecimalValidator(Validator): cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: DecimalValidator validator = DecimalValidator( - len(values), values.dtype, skipna=skipna + values.size, values.dtype, 
skipna=skipna ) return validator.validate(values) @@ -1996,7 +1996,7 @@ cdef class StringValidator(Validator): cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - StringValidator validator = StringValidator(len(values), + StringValidator validator = StringValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2013,7 +2013,7 @@ cdef class BytesValidator(Validator): cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - BytesValidator validator = BytesValidator(len(values), values.dtype, + BytesValidator validator = BytesValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2064,7 +2064,7 @@ cdef class DatetimeValidator(TemporalValidator): cpdef bint is_datetime_array(ndarray values, bint skipna=True): cdef: - DatetimeValidator validator = DatetimeValidator(len(values), + DatetimeValidator validator = DatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2078,7 +2078,7 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: - Datetime64Validator validator = Datetime64Validator(len(values), + Datetime64Validator validator = Datetime64Validator(values.size, skipna=skipna) return validator.validate(values) @@ -2093,7 +2093,7 @@ cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: - AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + AnyDatetimeValidator validator = AnyDatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2105,7 +2105,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. """ cdef: - Py_ssize_t i = 0, j, n = len(values) + Py_ssize_t i = 0, j, n = values.size object base_val, base_tz, val, tz if n == 0: @@ -2153,7 +2153,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): Infer with timedeltas and/or nat/none. """ cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + AnyTimedeltaValidator validator = AnyTimedeltaValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2167,7 +2167,7 @@ cdef class DateValidator(Validator): # Note: only python-exposed for tests cpdef bint is_date_array(ndarray values, bint skipna=False): cdef: - DateValidator validator = DateValidator(len(values), skipna=skipna) + DateValidator validator = DateValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2180,7 +2180,7 @@ cdef class TimeValidator(Validator): # Note: only python-exposed for tests cpdef bint is_time_array(ndarray values, bint skipna=False): cdef: - TimeValidator validator = TimeValidator(len(values), skipna=skipna) + TimeValidator validator = TimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2231,14 +2231,14 @@ cpdef bint is_interval_array(ndarray values): Is this an ndarray of Interval (or np.nan) with a single dtype? 
""" cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = values.size str closed = None bint numeric = False bint dt64 = False bint td64 = False object val - if len(values) == 0: + if n == 0: return False for i in range(n): diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..5fb6f1118d648 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -1,94 +1,119 @@ -_algos_take_helper = custom_target('algos_take_helper_pxi', +_algos_take_helper = custom_target( + 'algos_take_helper_pxi', output: 'algos_take_helper.pxi', input: 'algos_take_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_algos_common_helper = custom_target('algos_common_helper_pxi', +_algos_common_helper = custom_target( + 'algos_common_helper_pxi', output: 'algos_common_helper.pxi', input: 'algos_common_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_khash_primitive_helper = custom_target('khash_primitive_helper_pxi', +_khash_primitive_helper = custom_target( + 'khash_primitive_helper_pxi', output: 'khash_for_primitive_helper.pxi', input: 'khash_for_primitive_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_hashtable_class_helper = custom_target('hashtable_class_helper_pxi', +_hashtable_class_helper = custom_target( + 'hashtable_class_helper_pxi', output: 'hashtable_class_helper.pxi', input: 'hashtable_class_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_hashtable_func_helper = custom_target('hashtable_func_helper_pxi', +_hashtable_func_helper = custom_target( + 'hashtable_func_helper_pxi', output: 'hashtable_func_helper.pxi', input: 'hashtable_func_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_index_class_helper = custom_target('index_class_helper_pxi', +_index_class_helper = custom_target( + 'index_class_helper_pxi', output: 'index_class_helper.pxi', input: 'index_class_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_sparse_op_helper = custom_target('sparse_op_helper_pxi', +_sparse_op_helper = custom_target( + 'sparse_op_helper_pxi', output: 'sparse_op_helper.pxi', input: 'sparse_op_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_intervaltree_helper = custom_target('intervaltree_helper_pxi', +_intervaltree_helper = custom_target( + 'intervaltree_helper_pxi', output: 'intervaltree.pxi', input: 'intervaltree.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], +) +_khash_primitive_helper_dep = declare_dependency( + sources: _khash_primitive_helper, ) -_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, + 'algos': { + 'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], + 'deps': 
_khash_primitive_helper_dep, + }, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, - 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, + 'hashtable': { + 'sources': [ + 'hashtable.pyx', + _hashtable_class_helper, + _hashtable_func_helper, + ], + 'deps': _khash_primitive_helper_dep, + }, + 'index': { + 'sources': ['index.pyx', _index_class_helper], + 'deps': _khash_primitive_helper_dep, + }, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper], - 'deps': _khash_primitive_helper_dep}, - 'join': {'sources': ['join.pyx', _khash_primitive_helper], - 'deps': _khash_primitive_helper_dep}, + 'interval': { + 'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep, + }, + 'join': { + 'sources': ['join.pyx', _khash_primitive_helper], + 'deps': _khash_primitive_helper_dep, + }, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, - 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', - 'src/vendored/numpy/datetime/np_datetime_strings.c', - 'src/datetime/date_conversions.c', - 'src/datetime/pd_datetime.c']}, - 'pandas_parser': {'sources': ['src/parser/tokenizer.c', - 'src/parser/io.c', - 'src/parser/pd_parser.c']}, - 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': _khash_primitive_helper_dep}, - 'json': {'sources': ['src/vendored/ujson/python/ujson.c', - 'src/vendored/ujson/python/objToJSON.c', - 'src/vendored/ujson/python/JSONtoObj.c', - 'src/vendored/ujson/lib/ultrajsonenc.c', - 'src/vendored/ujson/lib/ultrajsondec.c']}, + 'pandas_datetime': { + 'sources': [ + 'src/vendored/numpy/datetime/np_datetime.c', + 'src/vendored/numpy/datetime/np_datetime_strings.c', + 'src/datetime/date_conversions.c', + 'src/datetime/pd_datetime.c', + ], + }, + 'pandas_parser': { + 'sources': [ + 'src/parser/tokenizer.c', + 'src/parser/io.c', + 'src/parser/pd_parser.c', + ], + }, + 'parsers': { + 'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], + 'deps': _khash_primitive_helper_dep, + }, + 'json': { + 'sources': [ + 'src/vendored/ujson/python/ujson.c', + 'src/vendored/ujson/python/objToJSON.c', + 'src/vendored/ujson/python/JSONtoObj.c', + 'src/vendored/ujson/lib/ultrajsonenc.c', + 'src/vendored/ujson/lib/ultrajsondec.c', + ], + }, 'ops': {'sources': ['ops.pyx']}, 'ops_dispatch': {'sources': ['ops_dispatch.pyx']}, 'properties': {'sources': ['properties.pyx']}, @@ -98,13 +123,13 @@ libs_sources = { 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, - 'writers': {'sources': ['writers.pyx']} + 'writers': {'sources': ['writers.pyx']}, } cython_args = [ '--include-dir', meson.current_build_dir(), - '-X always_allow_keywords=true' + '-X always_allow_keywords=true', ] if get_option('buildtype') == 'debug' cython_args += ['--gdb'] @@ -118,7 +143,7 @@ foreach ext_name, ext_dict : libs_sources include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', - install: true + install: true, ) endforeach @@ -148,14 +173,11 @@ sources_to_install = [ 'sparse.pyi', 'testing.pyi', 'tslib.pyi', - 
'writers.pyi' + 'writers.pyi', ] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs') endforeach subdir('window') diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c9f7a796a9b1c..61e96fc835e4d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -148,7 +148,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = malloc(STREAM_INIT_SIZE); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -221,9 +221,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { char *orig_ptr = (void *)self->stream; TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) - self->stream = - (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, sizeof(char), &status); + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, 1, &status); TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index cc65f34d6b6fe..9a022095feee9 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -660,11 +660,12 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (npy_int32)extract_unit(&dt, 1000LL); - out->ps = (npy_int32)(dt * 1000); + out->hour = + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->ps = (npy_int32)(dt); break; case NPY_FR_fs: diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index c8d8b5ab6bd6e..1564ecb64b01d 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -920,7 +920,7 @@ Perhaps implement recursion detection */ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { const char *value; - char *objName; + const char *objName; int count; JSOBJ iterObj; size_t szlen; diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 5f35860c59cb7..8342dbcd1763d 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -53,8 +53,8 @@ Numeric decoder derived from TCL library npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, - size_t *_outLen); +typedef const char *(*PFN_PyTypeToUTF8)(JSOBJ obj, 
JSONTypeContext *ti, + size_t *_outLen); int object_is_decimal_type(PyObject *obj); int object_is_dataframe_type(PyObject *obj); @@ -106,7 +106,7 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char *cStr; + const char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; @@ -301,14 +301,15 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), - size_t *_outLen) { +static const char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), + size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); } -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { +static const char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, + size_t *_outLen) { char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); if (encoded == NULL) { /* Something went wrong. @@ -321,8 +322,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); @@ -330,15 +331,15 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); return GET_TC(tc)->cStr; } /* JSON callback */ -static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, - size_t *len) { +static const char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; @@ -349,7 +350,8 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, return PyDateTimeToIso(obj, base, len); } -static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { +static const char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, + size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { @@ -373,6 +375,27 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { return outValue; } +static const char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, + size_t *len) { + PyObject *obj = (PyObject *)_obj; + PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); + PyObject *str = PyObject_Format(obj, format_spec); + Py_DECREF(format_spec); + + if (str == NULL) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + + GET_TC(tc)->newObj = str; + + Py_ssize_t s_len; + char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); + *len = s_len; + + return outValue; +} + 
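[Editor's note: illustrative sketch, not part of the patch] Together with the dispatch change in Object_beginTypeContext further down, this callback makes the JSON encoder format decimal.Decimal values with format(obj, "f") instead of squeezing them through PyFloat_AsDouble, and a NaN Decimal is emitted as JSON null. A minimal way to exercise the path, assuming an object-dtype Series of Decimals reaches this encoder via to_json:

    >>> from decimal import Decimal
    >>> import pandas as pd
    >>> pd.Series([Decimal("1.0000000000000000000000001"), Decimal("NaN")]).to_json()

The finite entry should keep all of its digits rather than being rounded to the nearest float64, and the NaN entry should come out as null.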
//============================================================================= // Numpy array iteration functions //============================================================================= @@ -537,10 +560,10 @@ static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -588,11 +611,11 @@ static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { const npy_intp idx = blkCtxt->colIdx - 1; @@ -610,12 +633,12 @@ static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, return cStr; } -static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -796,9 +819,9 @@ static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -843,9 +866,9 @@ static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Set_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Set_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -941,8 +964,8 @@ static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -973,9 +996,9 @@ static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *List_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *List_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -984,24 +1007,16 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj), 
//============================================================================= static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1021,8 +1036,8 @@ static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1033,28 +1048,20 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1076,8 +1083,8 @@ static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1088,28 +1095,20 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->cStr = "columns"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - 
memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; Py_INCREF(obj); GET_TC(tc)->itemValue = obj; } else { @@ -1129,8 +1128,8 @@ static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1180,8 +1179,8 @@ static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1467,8 +1466,18 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); + if (is_nan_py == NULL) { + goto INVALID; + } + int is_nan = (is_nan_py == Py_True); + Py_DECREF(is_nan_py); + if (is_nan) { + tc->type = JT_NULL; + return; + } + pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; + tc->type = JT_UTF8; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { @@ -1871,7 +1880,6 @@ static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->rowLabels = NULL; NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; @@ -1922,8 +1930,8 @@ static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->iterGetValue(obj, tc); } -static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { +static const char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 31979b293a940..f433a3acf356f 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,39 +1,39 @@ __all__ = [ - "dtypes", - "localize_pydatetime", + "BaseOffset", + "IncompatibleFrequency", "NaT", "NaTType", - "iNaT", - "nat_strings", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", - "IncompatibleFrequency", "Period", "Resolution", + "Tick", "Timedelta", - "normalize_i8_timestamps", - "is_date_array_normalized", - "dt64arr_to_periodarr", + "Timestamp", + "add_overflowsafe", + "astype_overflowsafe", "delta_to_nanoseconds", + "dt64arr_to_periodarr", + "dtypes", + "get_resolution", + "get_supported_dtype", + "get_unit_from_dtype", + "guess_datetime_format", + "iNaT", "ints_to_pydatetime", "ints_to_pytimedelta", - "get_resolution", - "Timestamp", - "tz_convert_from_utc_single", - "tz_convert_from_utc", - "to_offset", - "Tick", - "BaseOffset", - "tz_compare", + "is_date_array_normalized", + "is_supported_dtype", "is_unitless", - "astype_overflowsafe", - "get_unit_from_dtype", + 
"localize_pydatetime", + "nat_strings", + "normalize_i8_timestamps", "periods_per_day", "periods_per_second", - "guess_datetime_format", - "add_overflowsafe", - "get_supported_dtype", - "is_supported_dtype", + "to_offset", + "tz_compare", + "tz_convert_from_utc", + "tz_convert_from_utc_single", ] from pandas._libs.tslibs import dtypes diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a635dd33f8420..7a8b4df447aee 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -149,18 +149,18 @@ def cast_from_unit_vectorized( if p: frac = np.round(frac, p) - try: - for i in range(len(values)): + for i in range(len(values)): + try: if base[i] == NPY_NAT: out[i] = NPY_NAT else: out[i] = (base[i] * m) + (frac[i] * m) - except (OverflowError, FloatingPointError) as err: - # FloatingPointError can be issued if we have float dtype and have - # set np.errstate(over="raise") - raise OutOfBoundsDatetime( - f"cannot convert input {values[i]} with the unit '{unit}'" - ) from err + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err return out diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 85410f771233f..052a8568b76af 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -22,7 +22,7 @@ tslibs_sources = { cython_args = [ '--include-dir', meson.current_build_dir(), - '-X always_allow_keywords=true' + '-X always_allow_keywords=true', ] if get_option('buildtype') == 'debug' cython_args += ['--gdb'] @@ -36,7 +36,7 @@ foreach ext_name, ext_dict : tslibs_sources include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs/tslibs', - install: true + install: true, ) endforeach @@ -56,12 +56,9 @@ sources_to_install = [ 'timestamps.pyi', 'timezones.pyi', 'tzconversion.pyi', - 'vectorized.pyi' + 'vectorized.pyi', ] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs/tslibs' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs/tslibs') endforeach diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 1c0a99eb1ea25..2657b1b9d197b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -704,7 +704,7 @@ class NaTType(_NaT): difference between the current timezone and UTC. Returns - -------- + ------- timedelta The difference between UTC and the local time as a `timedelta` object. diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 193556b2697a9..1b7f04fe17238 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -201,6 +201,10 @@ class OutOfBoundsTimedelta(ValueError): Representation should be within a timedelta64[ns]. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + Examples -------- >>> pd.date_range(start="1/1/1700", freq="B", periods=100000) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7569f8e8864a0..36b431974c121 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -720,11 +720,24 @@ cdef class BaseOffset: """ Return boolean whether a timestamp intersects with this frequency. 
+ This method determines if a given timestamp aligns with the start + of a custom business month, as defined by this offset. It accounts + for custom rules, such as skipping weekends or other non-business days, + and checks whether the provided datetime falls on a valid business day + that marks the beginning of the custom business month. + Parameters ---------- dt : datetime.datetime Timestamp to check intersections with frequency. + See Also + -------- + tseries.offsets.CustomBusinessMonthBegin : Represents the start of a custom + business month. + tseries.offsets.CustomBusinessMonthEnd : Represents the end of a custom + business month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -3710,6 +3723,15 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + tseries.offsets.WeekOfMonth : + Date offset for a specific weekday in a month. + tseries.offsets.MonthEnd : + Date offset for the end of the month. + tseries.offsets.BMonthEnd : + Date offset for the last business day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c563ab91c4142..f697180da5eeb 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -114,6 +114,7 @@ from pandas._libs.tslibs.offsets import ( INVALID_FREQ_ERR_MSG, BDay, ) +from pandas.util._decorators import set_module cdef: enum: @@ -678,7 +679,7 @@ cdef char* c_strftime(npy_datetimestruct *dts, char *fmt): c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1 c_date.tm_isdst = -1 - result = malloc(result_len * sizeof(char)) + result = malloc(result_len) if result is NULL: raise MemoryError() @@ -2830,6 +2831,7 @@ cdef class _Period(PeriodMixin): return period_format(self.ordinal, base, fmt) +@set_module("pandas") class Period(_Period): """ Represents a period of time. diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 15b629624bafc..e320aca04683c 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,6 +1,7 @@ import collections import warnings +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level cimport cython @@ -1854,7 +1855,7 @@ cdef class _Timedelta(timedelta): # Python front end to C extension type _Timedelta # This serves as the box for timedelta64 - +@set_module("pandas") class Timedelta(_Timedelta): """ Represents a duration, the difference between two dates or times. 
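[Editor's note: illustrative sketch, not part of the patch] The @set_module("pandas") decorator applied here (and to Period above and Timestamp further down) only rebinds the class's reported module to the public namespace, so introspection points at pandas.Timedelta rather than the private pandas._libs.tslibs.timedeltas module. Assuming the decorator simply assigns __module__:

    >>> import pandas as pd
    >>> pd.Timedelta.__module__
    'pandas'
    >>> f"{pd.Timedelta.__module__}.{pd.Timedelta.__qualname__}"
    'pandas.Timedelta'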
@@ -1916,7 +1917,7 @@ class Timedelta(_Timedelta): -------- Here we initialize Timedelta object with both value and unit - >>> td = pd.Timedelta(1, "d") + >>> td = pd.Timedelta(1, "D") >>> td Timedelta('1 days 00:00:00') diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index bd73c713f6c04..c5ec92fabc7f8 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -21,7 +21,7 @@ cdef _Timestamp create_timestamp_from_ts(int64_t value, cdef class _Timestamp(ABCTimestamp): cdef readonly: - int64_t _value, nanosecond, year + int64_t _value, _nanosecond, _year NPY_DATETIMEUNIT _creso cdef bint _get_start_end_field(self, str field, freq) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 34c84d396ad64..6b4b90167e625 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -50,6 +50,7 @@ import datetime as dt from pandas._libs.tslibs cimport ccalendar from pandas._libs.tslibs.base cimport ABCTimestamp +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.conversion cimport ( @@ -161,8 +162,8 @@ cdef _Timestamp create_timestamp_from_ts( dts.sec, dts.us, tz, fold=fold) ts_base._value = value - ts_base.year = dts.year - ts_base.nanosecond = dts.ps // 1000 + ts_base._year = dts.year + ts_base._nanosecond = dts.ps // 1000 ts_base._creso = reso return ts_base @@ -355,9 +356,9 @@ cdef class _Timestamp(ABCTimestamp): # ----------------------------------------------------------------- def __hash__(_Timestamp self): - if self.nanosecond: + if self._nanosecond: return hash(self._value) - if not (1 <= self.year <= 9999): + if not (1 <= self._year <= 9999): # out of bounds for pydatetime return hash(self._value) if self.fold: @@ -375,7 +376,7 @@ cdef class _Timestamp(ABCTimestamp): elif cnp.is_datetime64_object(other): ots = Timestamp(other) elif PyDateTime_Check(other): - if self.nanosecond == 0: + if self._nanosecond == 0: val = self.to_pydatetime() return PyObject_RichCompareBool(val, other, op) @@ -454,7 +455,7 @@ cdef class _Timestamp(ABCTimestamp): if not self._can_compare(other): return NotImplemented - if self.nanosecond == 0: + if self._nanosecond == 0: return PyObject_RichCompareBool(dtval, other, op) # otherwise we have dtval < self @@ -463,9 +464,9 @@ cdef class _Timestamp(ABCTimestamp): if op == Py_EQ: return False if op == Py_LE or op == Py_LT: - return self.year <= other.year + return self._year <= other.year if op == Py_GE or op == Py_GT: - return self.year >= other.year + return self._year >= other.year cdef bint _can_compare(self, datetime other): if self.tzinfo is not None: @@ -606,7 +607,7 @@ cdef class _Timestamp(ABCTimestamp): if own_tz is not None and not is_utc(own_tz): pydatetime_to_dtstruct(self, &dts) - val = npy_datetimestruct_to_datetime(self._creso, &dts) + self.nanosecond + val = npy_datetimestruct_to_datetime(self._creso, &dts) + self._nanosecond else: val = self._value return val @@ -898,7 +899,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts.is_leap_year True """ - return bool(ccalendar.is_leapyear(self.year)) + return bool(ccalendar.is_leapyear(self._year)) @property def day_of_week(self) -> int: @@ -942,7 +943,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts.day_of_year 74 """ - return ccalendar.get_day_of_year(self.year, self.month, self.day) + return ccalendar.get_day_of_year(self._year, self.month, self.day) @property def quarter(self) -> int: @@ -1029,6 
+1030,29 @@ cdef class _Timestamp(ABCTimestamp): """ return super().fold + @property + def year(self) -> int: + """ + Return the year of the Timestamp. + + Returns + ------- + int + The year of the Timestamp. + + See Also + -------- + Timestamp.month : Return the month of the Timestamp. + Timestamp.day : Return the day of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30") + >>> ts.year + 2024 + """ + return self._year + @property def month(self) -> int: """ @@ -1144,6 +1168,29 @@ cdef class _Timestamp(ABCTimestamp): """ return super().microsecond + @property + def nanosecond(self) -> int: + """ + Return the nanosecond of the Timestamp. + + Returns + ------- + int + The nanosecond of the Timestamp. + + See Also + -------- + Timestamp.second : Return the second of the Timestamp. + Timestamp.microsecond : Return the microsecond of the Timestamp. + + Examples + -------- + >>> ts = pd.Timestamp("2024-08-31 16:16:30.230400015") + >>> ts.nanosecond + 15 + """ + return self._nanosecond + @property def week(self) -> int: """ @@ -1164,7 +1211,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts.week 11 """ - return ccalendar.get_week_of_year(self.year, self.month, self.day) + return ccalendar.get_week_of_year(self._year, self.month, self.day) @property def days_in_month(self) -> int: @@ -1186,7 +1233,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts.days_in_month 31 """ - return ccalendar.get_days_in_month(self.year, self.month) + return ccalendar.get_days_in_month(self._year, self.month) # ----------------------------------------------------------------- # Transformation Methods @@ -1260,7 +1307,7 @@ cdef class _Timestamp(ABCTimestamp): The full format looks like 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn'. By default, the fractional part is omitted if self.microsecond == 0 - and self.nanosecond == 0. + and self._nanosecond == 0. If self.tzinfo is not None, the UTC offset is also attached, giving giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn+HH:MM'. 
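[Editor's note: illustrative sketch, not part of the patch] The cdef readonly fields year and nanosecond are renamed to _year and _nanosecond so that Timestamp can expose plain Python properties with full docstrings, while the Cython internals keep reading the private attributes. User-facing access is unchanged; combining the two docstring examples added above:

    >>> import pandas as pd
    >>> ts = pd.Timestamp("2024-08-31 16:16:30.230400015")
    >>> (ts.year, ts.nanosecond)
    (2024, 15)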
@@ -1296,9 +1343,9 @@ cdef class _Timestamp(ABCTimestamp): base_ts = "microseconds" if timespec == "nanoseconds" else timespec base = super(_Timestamp, self).isoformat(sep=sep, timespec=base_ts) # We need to replace the fake year 1970 with our real year - base = f"{self.year:04d}-" + base.split("-", 1)[1] + base = f"{self._year:04d}-" + base.split("-", 1)[1] - if self.nanosecond == 0 and timespec != "nanoseconds": + if self._nanosecond == 0 and timespec != "nanoseconds": return base if self.tzinfo is not None: @@ -1306,11 +1353,11 @@ cdef class _Timestamp(ABCTimestamp): else: base1, base2 = base, "" - if timespec == "nanoseconds" or (timespec == "auto" and self.nanosecond): + if timespec == "nanoseconds" or (timespec == "auto" and self._nanosecond): if self.microsecond or timespec == "nanoseconds": - base1 += f"{self.nanosecond:03d}" + base1 += f"{self._nanosecond:03d}" else: - base1 += f".{self.nanosecond:09d}" + base1 += f".{self._nanosecond:09d}" return base1 + base2 @@ -1344,14 +1391,14 @@ cdef class _Timestamp(ABCTimestamp): def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but # the datetime strftime() methods require year >= 1900 and is slower - return f"{self.year}-{self.month:02d}-{self.day:02d}" + return f"{self._year}-{self.month:02d}-{self.day:02d}" @property def _time_repr(self) -> str: result = f"{self.hour:02d}:{self.minute:02d}:{self.second:02d}" - if self.nanosecond != 0: - result += f".{self.nanosecond + 1000 * self.microsecond:09d}" + if self._nanosecond != 0: + result += f".{self._nanosecond + 1000 * self.microsecond:09d}" elif self.microsecond != 0: result += f".{self.microsecond:06d}" @@ -1515,11 +1562,11 @@ cdef class _Timestamp(ABCTimestamp): >>> pd.NaT.to_pydatetime() NaT """ - if self.nanosecond != 0 and warn: + if self._nanosecond != 0 and warn: warnings.warn("Discarding nonzero nanoseconds in conversion.", UserWarning, stacklevel=find_stack_level()) - return datetime(self.year, self.month, self.day, + return datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) @@ -1648,7 +1695,7 @@ cdef class _Timestamp(ABCTimestamp): # Python front end to C extension type _Timestamp # This serves as the box for datetime64 - +@set_module("pandas") class Timestamp(_Timestamp): """ Pandas replacement for python datetime.datetime object. 
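[Editor's note: illustrative sketch, not part of the patch] The to_pydatetime hunk above only swaps the public attributes for their private counterparts; the observable behaviour is unchanged, so sub-microsecond precision is still dropped with the same warning:

    >>> import pandas as pd
    >>> ts = pd.Timestamp("2024-08-31 16:16:30.230400015")
    >>> ts.to_pydatetime()  # warns: Discarding nonzero nanoseconds in conversion.
    datetime.datetime(2024, 8, 31, 16, 16, 30, 230400)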
@@ -1998,7 +2045,7 @@ class Timestamp(_Timestamp): '2020-03-14 15:32:52' """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -2041,7 +2088,7 @@ class Timestamp(_Timestamp): 'Sun Jan 1 10:00:00 2023' """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -2081,7 +2128,7 @@ class Timestamp(_Timestamp): datetime.date(2023, 1, 1) """ try: - _dt = dt.date(self.year, self.month, self.day) + _dt = dt.date(self._year, self.month, self.day) except ValueError as err: raise NotImplementedError( "date not yet supported on Timestamps which " @@ -2130,7 +2177,7 @@ class Timestamp(_Timestamp): datetime.IsoCalendarDate(year=2022, week=52, weekday=7) """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -2170,7 +2217,7 @@ class Timestamp(_Timestamp): difference between the current timezone and UTC. Returns - -------- + ------- timedelta The difference between UTC and the local time as a `timedelta` object. @@ -2272,7 +2319,7 @@ class Timestamp(_Timestamp): tm_hour=10, tm_min=0, tm_sec=0, tm_wday=6, tm_yday=1, tm_isdst=-1) """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -2333,7 +2380,7 @@ class Timestamp(_Timestamp): 738521 """ try: - _dt = datetime(self.year, self.month, self.day, + _dt = datetime(self._year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond, self.tzinfo, fold=self.fold) except ValueError as err: @@ -2926,7 +2973,7 @@ timedelta}, default 'raise' -------- >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm') >>> ts.tz - + zoneinfo.ZoneInfo(key='Europe/Stockholm') """ return self.tzinfo @@ -3222,7 +3269,7 @@ default 'raise' # setup components pandas_datetime_to_datetimestruct(value, self._creso, &dts) - dts.ps = self.nanosecond * 1000 + dts.ps = self._nanosecond * 1000 # replace def validate(k, v): @@ -3312,7 +3359,7 @@ default 'raise' >>> ts.to_julian_date() 2458923.147824074 """ - year = self.year + year = self._year month = self.month day = self.day if month <= 2: @@ -3329,7 +3376,7 @@ default 'raise' self.minute / 60.0 + self.second / 3600.0 + self.microsecond / 3600.0 / 1e+6 + - self.nanosecond / 3600.0 / 1e+9 + self._nanosecond / 3600.0 / 1e+9 ) / 24.0) def isoweekday(self): @@ -3380,7 +3427,7 @@ default 'raise' """ # same as super().weekday(), but that breaks because of how # we have overridden year, see note in create_timestamp_from_ts - return ccalendar.dayofweek(self.year, self.month, self.day) + return ccalendar.dayofweek(self._year, self.month, self.day) # Aliases diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index a6cfbec9b15b9..ee735761e3dc6 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -60,6 +60,18 @@ def roll_min( end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... 
# np.ndarray[float] +def roll_first( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_last( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] def roll_quantile( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5b9ee095d4643..d33c840371d2a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1133,6 +1133,89 @@ cdef _roll_min_max(ndarray[float64_t] values, return output +# ---------------------------------------------------------------------- +# Rolling first, last + + +def roll_first(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=1) + + +def roll_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=0) + + +cdef _roll_first_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, bint is_first): + cdef: + Py_ssize_t i, j, fl_idx + bint is_monotonic_increasing_bounds + int64_t nobs = 0, N = len(start), s, e + float64_t val, res + ndarray[float64_t] output + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + + output = np.empty(N, dtype=np.float64) + + if (end - start).max() == 0: + output[:] = NaN + return output + + with nogil: + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + fl_idx = -1 + nobs = 0 + for j in range(s, e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + else: + # handle deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + nobs -= 1 + + # update fl_idx if out of range, if first + if is_first and fl_idx < s: + fl_idx = -1 + for j in range(s, end[i - 1]): + val = values[j] + if val == val: + fl_idx = j + break + + # handle adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + + if nobs >= minp and fl_idx >= s: + res = values[fl_idx] + else: + res = NaN + + output[i] = res + + if not is_monotonic_increasing_bounds: + nobs = 0 + + return output + cdef enum InterpolationType: LINEAR, diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index ad15644f73a0c..1d49bba47e139 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -4,8 +4,8 @@ py.extension_module( cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', - override_options : ['cython_language=cpp'], - install: true + override_options: ['cython_language=cpp'], + install: true, ) py.extension_module( @@ -14,18 +14,11 @@ py.extension_module( cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', - install: true + install: true, ) -sources_to_install = [ - '__init__.py', - 'aggregations.pyi', - 'indexers.pyi' -] +sources_to_install = 
['__init__.py', 'aggregations.pyi', 'indexers.pyi'] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs/window' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs/window') endforeach diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0be01da1816a2..ec9b5098c97c9 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, ContextManager, - cast, ) import numpy as np @@ -21,8 +20,6 @@ from pandas.compat import pa_version_under10p1 -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( ArrowDtype, @@ -77,8 +74,8 @@ with_csv_dialect, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, - ExtensionArray, NumpyExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -92,7 +89,6 @@ NpDtype, ) - from pandas.core.arrays import ArrowExtensionArray UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"] @@ -501,6 +497,8 @@ def shares_memory(left, right) -> bool: if isinstance(left, MultiIndex): return shares_memory(left._codes, right) if isinstance(left, (Index, Series)): + if isinstance(right, (Index, Series)): + return shares_memory(left._values, right._values) return shares_memory(left._values, right) if isinstance(left, NDArrayBackedExtensionArray): @@ -510,24 +508,18 @@ def shares_memory(left, right) -> bool: if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if ( - isinstance(left, ExtensionArray) - and is_string_dtype(left.dtype) - and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] - ): - # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 - left = cast("ArrowExtensionArray", left) - if ( - isinstance(right, ExtensionArray) - and is_string_dtype(right.dtype) - and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] - ): - right = cast("ArrowExtensionArray", right) + if isinstance(left, ArrowExtensionArray): + if isinstance(right, ArrowExtensionArray): + # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left_pa_data = left._pa_array right_pa_data = right._pa_array left_buf1 = left_pa_data.chunk(0).buffers()[1] right_buf1 = right_pa_data.chunk(0).buffers()[1] - return left_buf1 == right_buf1 + return left_buf1.address == right_buf1.address + else: + # if we have one one ArrowExtensionArray and one other array, assume + # they can only share memory if they share the same numpy buffer + return np.shares_memory(left, right) if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray): # By convention, we'll say these share memory if they share *either* @@ -548,6 +540,25 @@ def shares_memory(left, right) -> bool: "ALL_INT_NUMPY_DTYPES", "ALL_NUMPY_DTYPES", "ALL_REAL_NUMPY_DTYPES", + "BOOL_DTYPES", + "BYTES_DTYPES", + "COMPLEX_DTYPES", + "DATETIME64_DTYPES", + "ENDIAN", + "FLOAT_EA_DTYPES", + "FLOAT_NUMPY_DTYPES", + "NARROW_NP_DTYPES", + "NP_NAT_OBJECTS", + "NULL_OBJECTS", + "OBJECT_DTYPES", + "SIGNED_INT_EA_DTYPES", + "SIGNED_INT_NUMPY_DTYPES", + "STRING_DTYPES", + "TIMEDELTA64_DTYPES", + "UNSIGNED_INT_EA_DTYPES", + "UNSIGNED_INT_NUMPY_DTYPES", + "SubclassedDataFrame", + "SubclassedSeries", "assert_almost_equal", "assert_attr_equal", "assert_categorical_equal", @@ -571,51 
+582,32 @@ def shares_memory(left, right) -> bool: "assert_sp_array_equal", "assert_timedelta_array_equal", "at", - "BOOL_DTYPES", "box_expected", - "BYTES_DTYPES", "can_set_locale", - "COMPLEX_DTYPES", "convert_rows_list_to_csv_str", - "DATETIME64_DTYPES", "decompress_file", - "ENDIAN", "ensure_clean", "external_error_raised", - "FLOAT_EA_DTYPES", - "FLOAT_NUMPY_DTYPES", "get_cython_table_params", "get_dtype", - "getitem", - "get_locales", "get_finest_unit", + "get_locales", "get_obj", "get_op_from_name", + "getitem", "iat", "iloc", "loc", "maybe_produces_warning", - "NARROW_NP_DTYPES", - "NP_NAT_OBJECTS", - "NULL_OBJECTS", - "OBJECT_DTYPES", "raise_assert_detail", "raises_chained_assignment_error", "round_trip_pathlib", "round_trip_pickle", - "setitem", "set_locale", "set_timezone", + "setitem", "shares_memory", - "SIGNED_INT_EA_DTYPES", - "SIGNED_INT_NUMPY_DTYPES", - "STRING_DTYPES", - "SubclassedDataFrame", - "SubclassedSeries", - "TIMEDELTA64_DTYPES", "to_array", - "UNSIGNED_INT_EA_DTYPES", - "UNSIGNED_INT_NUMPY_DTYPES", "with_csv_dialect", "write_to_compressed", ] diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 01c4dcd92ee40..daa5187cdb636 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -755,11 +755,8 @@ def assert_extension_array_equal( and atol is lib.no_default ): check_exact = ( - is_numeric_dtype(left.dtype) - and not is_float_dtype(left.dtype) - or is_numeric_dtype(right.dtype) - and not is_float_dtype(right.dtype) - ) + is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype) + ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) elif check_exact is lib.no_default: check_exact = False @@ -944,11 +941,8 @@ def assert_series_equal( and atol is lib.no_default ): check_exact = ( - is_numeric_dtype(left.dtype) - and not is_float_dtype(left.dtype) - or is_numeric_dtype(right.dtype) - and not is_float_dtype(right.dtype) - ) + is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype) + ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) left_index_dtypes = ( [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes ) diff --git a/pandas/_typing.py b/pandas/_typing.py index c1769126a5776..b515305fb6903 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -273,7 +273,7 @@ def mode(self) -> str: # for _get_filepath_or_buffer ... - def seek(self, __offset: int, __whence: int = ...) -> int: + def seek(self, offset: int, whence: int = ..., /) -> int: # with one argument: gzip.GzipFile, bz2.BZ2File # with two arguments: zip.ZipFile, read_sas ... @@ -288,13 +288,13 @@ def tell(self) -> int: class ReadBuffer(BaseBuffer, Protocol[AnyStr_co]): - def read(self, __n: int = ...) -> AnyStr_co: + def read(self, n: int = ..., /) -> AnyStr_co: # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File ... class WriteBuffer(BaseBuffer, Protocol[AnyStr_contra]): - def write(self, __b: AnyStr_contra) -> Any: + def write(self, b: AnyStr_contra, /) -> Any: # for gzip.GzipFile, bz2.BZ2File ... 
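[Editor's note: illustrative sketch, not part of the patch] The pandas/_typing.py hunks above replace the legacy double-underscore spelling of positional-only parameters with the PEP 570 slash syntax; type checkers treat both the same, but the slash form keeps the Protocol readable. A minimal self-contained illustration (ReadBuffer and GzipLike here are simplified stand-ins, not the real pandas types):

    from typing import Protocol

    class ReadBuffer(Protocol):
        # The "/" makes n positional-only, so an implementer may call the
        # parameter anything (e.g. "size") and still satisfy the protocol.
        def read(self, n: int = ..., /) -> bytes: ...

    class GzipLike:
        def read(self, size: int = -1) -> bytes:
            return b""

    buf: ReadBuffer = GzipLike()  # accepted by a static type checker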
diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 9b007e8fe8da4..8f659e3cd14c8 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -9,9 +9,9 @@ ) __all__ = [ - "interchange", "extensions", "indexers", + "interchange", "types", "typing", ] diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index ea5f1ba926899..1c88c0d35b4d7 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -21,13 +21,13 @@ ) __all__ = [ - "no_default", + "ExtensionArray", "ExtensionDtype", - "register_extension_dtype", + "ExtensionScalarOpsMixin", + "no_default", "register_dataframe_accessor", + "register_extension_dtype", "register_index_accessor", "register_series_accessor", "take", - "ExtensionArray", - "ExtensionScalarOpsMixin", ] diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index 78357f11dc3b7..f3c6546218de4 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -10,8 +10,8 @@ ) __all__ = [ - "check_array_indexer", "BaseIndexer", "FixedForwardWindowIndexer", "VariableOffsetWindowIndexer", + "check_array_indexer", ] diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py index 2f3a73bc46b31..aded37abc7224 100644 --- a/pandas/api/interchange/__init__.py +++ b/pandas/api/interchange/__init__.py @@ -5,4 +5,4 @@ from pandas.core.interchange.dataframe_protocol import DataFrame from pandas.core.interchange.from_dataframe import from_dataframe -__all__ = ["from_dataframe", "DataFrame"] +__all__ = ["DataFrame", "from_dataframe"] diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index c601086bb9f86..4a5c742b1628b 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -14,10 +14,10 @@ ) __all__ = [ - "infer_dtype", - "union_categoricals", "CategoricalDtype", "DatetimeTZDtype", "IntervalDtype", "PeriodDtype", + "infer_dtype", + "union_categoricals", ] diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index c58fa0f085266..c1178c72f3edc 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -3,6 +3,7 @@ """ from pandas._libs import NaTType +from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType from pandas.core.groupby import ( @@ -42,18 +43,17 @@ "ExponentialMovingWindowGroupby", "FrozenList", "JsonReader", - "NaTType", "NAType", + "NaTType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", "RollingGroupby", + "SASReader", "SeriesGroupBy", "StataReader", - "SASReader", - # See TODO above - # "Styler", - "TimedeltaIndexResamplerGroupby", "TimeGrouper", + "TimedeltaIndexResamplerGroupby", "Window", ] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 756c209661fbb..138456f877c5f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -34,6 +34,7 @@ pa_version_under16p0, pa_version_under17p0, pa_version_under18p0, + pa_version_under19p0, ) if TYPE_CHECKING: @@ -150,6 +151,13 @@ def is_ci_environment() -> bool: __all__ = [ + "HAS_PYARROW", + "IS64", + "ISMUSL", + "PY311", + "PY312", + "PYPY", + "WASM", "is_numpy_dev", "pa_version_under10p1", "pa_version_under11p0", @@ -159,11 +167,5 @@ def is_ci_environment() -> bool: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", - "HAS_PYARROW", - "IS64", - "ISMUSL", - "PY311", - "PY312", - "PYPY", - "WASM", + "pa_version_under19p0", ] diff --git a/pandas/compat/numpy/__init__.py 
b/pandas/compat/numpy/__init__.py index 2fab8f32b8e71..3306b36d71806 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -47,7 +47,7 @@ __all__ = [ - "np", "_np_version", "is_numpy_dev", + "np", ] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index bd009b544f31e..c501c06b93813 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -18,6 +18,7 @@ pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -30,4 +31,5 @@ pa_version_under16p0 = True pa_version_under17p0 = True pa_version_under18p0 = True + pa_version_under19p0 = True HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 7ad322d050c0f..f9c10a7758bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -600,7 +600,7 @@ def multiindex_year_month_day_dataframe_random_data(): """ tdf = DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() @@ -787,7 +787,7 @@ def string_series() -> Series: """ return Series( np.arange(30, dtype=np.float64) * 1.1, - index=Index([f"i_{i}" for i in range(30)], dtype=object), + index=Index([f"i_{i}" for i in range(30)]), name="series", ) @@ -798,7 +798,7 @@ def object_series() -> Series: Fixture for Series of dtype object with Index of unique strings """ data = [f"foo_{i}" for i in range(30)] - index = Index([f"bar_{i}" for i in range(30)], dtype=object) + index = Index([f"bar_{i}" for i in range(30)]) return Series(data, index=index, name="objects", dtype=object) @@ -890,8 +890,8 @@ def int_frame() -> DataFrame: """ return DataFrame( np.ones((30, 4), dtype=np.int64), - index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + index=Index([f"foo_{i}" for i in range(30)]), + columns=Index(list("ABCD")), ) @@ -1317,6 +1317,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. 
+ + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py index 1116c61c4ca8e..6983711480455 100644 --- a/pandas/core/_numba/kernels/__init__.py +++ b/pandas/core/_numba/kernels/__init__.py @@ -16,12 +16,12 @@ ) __all__ = [ - "sliding_mean", "grouped_mean", - "sliding_sum", + "grouped_min_max", "grouped_sum", - "sliding_var", "grouped_var", + "sliding_mean", "sliding_min_max", - "grouped_min_max", + "sliding_sum", + "sliding_var", ] diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index cc10bd003af7e..2b59ea2fe12a5 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -169,9 +169,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 59d36732ebae6..d56453e4e5abf 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -88,6 +88,7 @@ def grouped_min_max( ngroups: int, min_periods: int, is_max: bool, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) @@ -97,13 +98,16 @@ def grouped_min_max( for i in range(N): lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])): continue if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 else: - # NaN value cannot be a min/max value + if not skipna: + # If skipna is False and we encounter a NaN, + # both min and max of the group will be NaN + output[lab] = np.nan continue if nobs[lab] == 1: diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 76f4e22b43c4b..9f2e9541b31d0 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -165,6 +165,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -180,7 +181,15 @@ def grouped_kahan_sum( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan + nobs_arr[lab] += 1 + comp_arr[lab] = np.nan + consecutive_counts[lab] = 1 + prev_vals[lab] = np.nan continue sum_x = output[lab] @@ -219,11 +228,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 69aec4d6522c4..5d720c877815d 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -176,6 +176,7 @@ def grouped_var( ngroups: int, min_periods: int, ddof: int = 1, + skipna: bool = True, ) -> 
tuple[np.ndarray, list[int]]: N = len(labels) @@ -190,7 +191,11 @@ def grouped_var( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan continue mean_x = means[lab] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56f8adda93251..aafd802b827a5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,6 +23,7 @@ iNaT, lib, ) +from pandas._libs.missing import NA from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -544,10 +545,15 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), # isin is faster for small sizes + + # GH60678 + # Ensure values don't contain , otherwise it throws exception with np.in1d + if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object + and not any(v is NA for v in values) ): # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan @@ -1012,7 +1018,7 @@ def mode( return npresult, res_mask # type: ignore[return-value] try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", diff --git a/pandas/core/api.py b/pandas/core/api.py index c8a4e9d8a23b2..ec12d543d8389 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -80,59 +80,59 @@ from pandas.core.frame import DataFrame # isort:skip __all__ = [ - "array", + "NA", "ArrowDtype", - "bdate_range", "BooleanDtype", "Categorical", "CategoricalDtype", "CategoricalIndex", "DataFrame", "DateOffset", - "date_range", "DatetimeIndex", "DatetimeTZDtype", - "factorize", "Flags", "Float32Dtype", "Float64Dtype", "Grouper", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", - "interval_range", - "isna", - "isnull", "MultiIndex", - "NA", - "NamedAgg", "NaT", - "notna", - "notnull", + "NamedAgg", "Period", "PeriodDtype", "PeriodIndex", - "period_range", "RangeIndex", "Series", - "set_eng_float_format", "StringDtype", "Timedelta", "TimedeltaIndex", - "timedelta_range", "Timestamp", - "to_datetime", - "to_numeric", - "to_timedelta", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", + "array", + "bdate_range", + "date_range", + "factorize", + "interval_range", + "isna", + "isnull", + "notna", + "notnull", + "period_range", + "set_eng_float_format", + "timedelta_range", + "to_datetime", + "to_numeric", + "to_timedelta", "unique", ] diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index f946c5adcbb0b..debd6368e98a4 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -89,7 +89,8 @@ def _check_comparison_types( op = np.vectorize( lambda x: bool(re.search(b, x)) if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False + else False, + otypes=[bool], ) # GH#32621 use mask to avoid comparing to NAs @@ -151,4 +152,6 @@ def re_replacer(s): if mask is None: values[:] = f(values) else: + if values.ndim != mask.ndim: + mask = np.broadcast_to(mask, values.shape) values[mask] = f(values[mask]) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 245a171fea74b..f183e9236471e 100644 --- a/pandas/core/arrays/__init__.py +++ 
b/pandas/core/arrays/__init__.py @@ -23,21 +23,21 @@ __all__ = [ "ArrowExtensionArray", - "ExtensionArray", - "ExtensionOpsMixin", - "ExtensionScalarOpsMixin", "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", "FloatingArray", "IntegerArray", "IntervalArray", "NumpyExtensionArray", "PeriodArray", - "period_array", "SparseArray", "StringArray", "TimedeltaArray", + "period_array", ] diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 2d1b1eca55e98..1ca52ce64bd77 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -253,6 +253,10 @@ def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return self._convert_bool_result(result) + def _str_isascii(self): + result = pc.string_is_ascii(self._pa_array) + return self._convert_bool_result(result) + def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index 5fc50f786fc6a..50274a2de2cc1 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -4,4 +4,4 @@ ) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] +__all__ = ["ArrowExtensionArray", "ListAccessor", "StructAccessor"] diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index d9a80b699b0bb..b220a94d032b5 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data) -> None: dtype = data.dtype - if not isinstance(dtype, ArrowDtype): + if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. 
raise AttributeError(self._validation_msg.format(dtype=dtype)) @@ -117,7 +117,10 @@ def len(self) -> Series: value_lengths = pc.list_value_length(self._pa_array) return Series( - value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + value_lengths, + dtype=ArrowDtype(value_lengths.type), + index=self._data.index, + name=self._data.name, ) def __getitem__(self, key: int | slice) -> Series: @@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) return Series( - element, dtype=ArrowDtype(element.type), index=self._data.index + element, + dtype=ArrowDtype(element.type), + index=self._data.index, + name=self._data.name, ) elif isinstance(key, slice): if pa_version_under11p0: @@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) + return Series( + sliced, + dtype=ArrowDtype(sliced.type), + index=self._data.index, + name=self._data.name, + ) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -223,7 +234,12 @@ def flatten(self) -> Series: counts = pa.compute.list_value_length(self._pa_array) flattened = pa.compute.list_flatten(self._pa_array) index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) - return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) + return Series( + flattened, + dtype=ArrowDtype(flattened.type), + index=index, + name=self._data.name, + ) class StructAccessor(ArrowAccessor): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 53f703b701217..0b546bed1c2b7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -257,6 +258,7 @@ class ArrowExtensionArray( Parameters ---------- values : pyarrow.Array or pyarrow.ChunkedArray + The input data to initialize the ArrowExtensionArray. Attributes ---------- @@ -270,6 +272,12 @@ class ArrowExtensionArray( ------- ArrowExtensionArray + See Also + -------- + array : Create a Pandas array with a specified dtype. + DataFrame.to_feather : Write a DataFrame to the binary Feather format. + read_feather : Load a feather-format object from the file path. + Notes ----- Most methods are implemented using `pyarrow compute functions. `__ @@ -668,7 +676,16 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. 
+ copy = False + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types @@ -734,7 +751,7 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: try: result[valid] = op(np_array[valid], other) except TypeError: - result = ops.invalid_comparison(np_array, other, op) + result = ops.invalid_comparison(self, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: @@ -1136,7 +1153,7 @@ def fillna( try: fill_value = self._box_pa(value, pa_type=self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{value!s}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err try: @@ -1381,7 +1398,7 @@ def _to_datetimearray(self) -> DatetimeArray: np_dtype = np.dtype(f"M8[{pa_type.unit}]") dtype = tz_to_dtype(pa_type.tz, pa_type.unit) np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return DatetimeArray._simple_new(np_array, dtype=dtype) def _to_timedeltaarray(self) -> TimedeltaArray: @@ -1392,7 +1409,7 @@ def _to_timedeltaarray(self) -> TimedeltaArray: assert pa.types.is_duration(pa_type) np_dtype = np.dtype(f"m8[{pa_type.unit}]") np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) def _values_for_json(self) -> np.ndarray: @@ -1437,8 +1454,7 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or original_na_value is lib.no_default - and is_float_dtype(dtype) + or (original_na_value is lib.no_default and is_float_dtype(dtype)) ) ): result = data._pa_array.to_numpy() @@ -1611,6 +1627,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1635,13 +1654,68 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. 
+ pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. @@ -2127,7 +2201,7 @@ def _maybe_convert_setitem_value(self, value): try: value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{value!s}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err return value @@ -2148,7 +2222,7 @@ def interpolate( """ # NB: we return type(self) even if copy=False if not self.dtype._is_numeric: - raise ValueError("Values must be numeric.") + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") if ( not pa_version_under13p0 @@ -2303,6 +2377,20 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): + if how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, @@ -2443,8 +2531,6 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): else: dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) - if dtype == str: - dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5f2c2a7772f78..e831883998098 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2608,6 +2608,21 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + if op.how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "kurt", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7cde4c53cb2f5..ae20bfb6b284b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -579,11 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( - self, - dtype=dtype, - copy=copy, - ) + # For NumPy 1.x compatibility we cannot use copy=None. 
And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1663,7 +1664,7 @@ def __array__( Specifies the the dtype for the array. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -1686,13 +1687,18 @@ def __array__( >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods @@ -2730,7 +2736,7 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) dtype = self.dtype - if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]: raise TypeError(f"{dtype} type does not support {how} operations") if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a25a698856747..8a79ab53442c3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -359,7 +359,14 @@ def __array__( ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload @@ -1649,7 +1656,7 @@ def _groupby_op( dtype = self.dtype if dtype.kind == "M": # Adding/multiplying datetimes is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"datetime64 type does not support operation '{how}'") if how in ["any", "all"]: # GH#34479 @@ -1660,7 +1667,7 @@ def _groupby_op( elif isinstance(dtype, PeriodDtype): # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"Period type does not support {how} operations") if how in ["any", "all"]: # GH#34479 @@ -1670,7 +1677,7 @@ def _groupby_op( ) else: # timedeltas we can add but not multiply - if how in ["prod", "cumprod", "skew", "var"]: + if how in ["prod", "cumprod", "skew", "kurt", "var"]: raise TypeError(f"timedelta64 type does not support {how} operations") # All of the functions implemented here are ordinal, so we can @@ -2066,7 +2073,29 @@ def _creso(self) -> int: @cache_readonly def unit(self) -> str: - # e.g. "ns", "us", "ms" + """ + The precision unit of the datetime data. + + Returns the precision unit for the dtype. + It means the smallest time frame that can be stored within this dtype. 
+ + Returns + ------- + str + Unit string representation (e.g. "ns"). + + See Also + -------- + TimelikeOps.as_unit : Converts to a specific unit. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) + >>> idx.unit + 'ns' + >>> idx.as_unit("s").unit + 's' + """ # error: Argument 1 to "dtype_to_unit" has incompatible type # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f85fbd062b0c3..afbadd754cdbc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -105,6 +105,12 @@ class IntegerArray(NumericArray): ------- IntegerArray + See Also + -------- + array : Create an array using the appropriate dtype, including ``IntegerArray``. + Int32Dtype : An ExtensionDtype for int32 integer data. + UInt16Dtype : An ExtensionDtype for uint16 integer data. + Examples -------- Create an IntegerArray with :func:`pandas.array`. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2ac9c77bef322..0bf2089df5f85 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1055,7 +1055,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = IntervalArray.from_breaks( + [fill_value] * (empty_len + 1), closed=self.closed + ) else: empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) @@ -1233,6 +1235,22 @@ def left(self) -> Index: """ Return the left endpoints of each Interval in the IntervalArray as an Index. + This property provides access to the left endpoints of the intervals + contained within the IntervalArray. This can be useful for analyses where + the starting point of each interval is of interest, such as in histogram + creation, data aggregation, or any scenario requiring the identification + of the beginning of defined ranges. This property returns a ``pandas.Index`` + object containing the midpoint for each interval. + + See Also + -------- + arrays.IntervalArray.right : Return the right endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + arrays.IntervalArray.contains : Check elementwise if the Intervals contain + the value. + Examples -------- @@ -1253,6 +1271,21 @@ def right(self) -> Index: """ Return the right endpoints of each Interval in the IntervalArray as an Index. + This property extracts the right endpoints from each interval contained within + the IntervalArray. This can be helpful in use cases where you need to work + with or compare only the upper bounds of intervals, such as when performing + range-based filtering, determining interval overlaps, or visualizing the end + boundaries of data segments. + + See Also + -------- + arrays.IntervalArray.left : Return the left endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + arrays.IntervalArray.contains : Check elementwise if the Intervals contain + the value. + Examples -------- @@ -1273,6 +1306,20 @@ def length(self) -> Index: """ Return an Index with entries denoting the length of each Interval. 
+ The length of an interval is calculated as the difference between + its `right` and `left` bounds. This property is particularly useful + when working with intervals where the size of the interval is an important + attribute, such as in time-series analysis or spatial data analysis. + + See Also + -------- + arrays.IntervalArray.left : Return the left endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.right : Return the right endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + Examples -------- @@ -1606,6 +1653,11 @@ def __array__( Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 92ed690e527c7..f3a0cc0dccdb3 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -286,7 +286,7 @@ def _validate_setitem_value(self, value): # Note: without the "str" here, the f-string rendering raises in # py38 builds. - raise TypeError(f"Invalid value '{value!s}' for dtype {self.dtype}") + raise TypeError(f"Invalid value '{value!s}' for dtype '{self.dtype}'") def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) @@ -581,7 +581,17 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..ac0823ed903b3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -71,6 +71,11 @@ class NumpyExtensionArray( # type: ignore[misc] ------- None + See Also + -------- + array : Create an array. + Series.to_numpy : Convert a Series to a NumPy array. + Examples -------- >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3])) @@ -150,6 +155,9 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): @@ -284,6 +292,9 @@ def interpolate( See NDFrame.interpolate.__doc__. 
""" # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + if not copy: out_data = self._ndarray else: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7d0ad74f851f0..ae92e17332c76 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -390,8 +390,19 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: if dtype == "i8": - return self.asi8 - elif dtype == bool: + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self.asi8, dtype=dtype) + else: + return np.array(self.asi8, dtype=dtype) + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index adf83963aca39..93d5cb8cc335a 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -12,8 +12,8 @@ __all__ = [ "BlockIndex", "IntIndex", - "make_sparse_index", "SparseAccessor", "SparseArray", "SparseFrameAccessor", + "make_sparse_index", ] diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 0c76280e7fdb4..137dbb6e4d139 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -289,12 +289,18 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ An ExtensionArray for storing sparse data. + SparseArray efficiently stores data with a high frequency of a + specific fill value (e.g., zeros), saving memory by only retaining + non-fill elements and their indices. This class is particularly + useful for large datasets where most values are redundant. + Parameters ---------- data : array-like or scalar A dense array of values to store in the SparseArray. This may contain `fill_value`. sparse_index : SparseIndex, optional + Index indicating the locations of sparse elements. fill_value : scalar, optional Elements in data that are ``fill_value`` are not stored in the SparseArray. For memory savings, this should be the most common value @@ -345,6 +351,10 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): ------- None + See Also + -------- + SparseDtype : Dtype for sparse data. + Examples -------- >>> from pandas.arrays import SparseArray @@ -547,11 +557,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - fill_value = self.fill_value - if self.sp_index.ngaps == 0: # Compat for na dtype and int values. - return self.sp_values + if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + fill_value = self.fill_value + if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. 
We catch that diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 93c678f606fcd..623a6a10c75b5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import operator from typing import ( TYPE_CHECKING, @@ -27,7 +28,10 @@ pa_version_under10p1, ) from pandas.compat.numpy import function as nv -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( @@ -64,6 +68,8 @@ from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.io.formats import printing + if TYPE_CHECKING: import pyarrow @@ -83,6 +89,7 @@ from pandas import Series +@set_module("pandas") @register_extension_dtype class StringDtype(StorageExtensionDtype): """ @@ -391,6 +398,14 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _formatter(self, boxed: bool = False): + formatter = partial( + printing.pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=not boxed, + ) + return formatter + def _str_map( self, f, @@ -518,6 +533,11 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -641,7 +661,8 @@ def _validate_scalar(self, value): return self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a string array." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." ) return value @@ -714,40 +735,52 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: return arr, self.dtype.na_value - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract NumpyExtensionArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if lib.is_scalar(value): if isna(value): value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a StringArray." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should " + f"be a string or missing value, got '{type(value).__name__}' " + "instead." 
) else: + value = extract_array(value, extract_numpy=True) if not is_array_like(value): value = np.asarray(value, dtype=object) + elif isinstance(value.dtype, type(self.dtype)): + return value else: # cast categories and friends to arrays to see if values are # compatible, compatibility with arrow backed strings value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): - raise TypeError("Must provide strings.") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) + return value + + def __setitem__(self, key, value) -> None: + value = self._maybe_convert_setitem_value(value) + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = self.dtype.na_value + if not scalar_value: + if value.dtype == self.dtype: + value = value._ndarray + else: + value = np.asarray(value) + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -757,6 +790,12 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: + # the super() method NDArrayBackedExtensionArray._where uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + return ExtensionArray._where(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if isinstance(values, BaseStringArray) or ( isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) @@ -823,7 +862,7 @@ def _reduce( else: return nanops.nanall(self._ndarray, skipna=skipna) - if name in ["min", "max", "sum"]: + if name in ["min", "max", "argmin", "argmax", "sum"]: result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) if keepdims: return self._from_sequence([result], dtype=self.dtype) @@ -915,7 +954,6 @@ def _cmp_method(self, other, op): if not is_array_like(other): other = np.asarray(other) other = other[valid] - other = np.asarray(other) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cde39c7f4dc6a..d35083fd892a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -17,6 +17,7 @@ from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, + pa_version_under16p0, ) from pandas.util._exceptions import find_stack_level @@ -71,6 +72,10 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) +def _is_string_view(typ): + return not pa_version_under16p0 and pa.types.is_string_view(typ) + + # TODO: Inherit directly from BaseStringArrayMethods. 
Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -128,11 +133,13 @@ def __init__(self, values) -> None: _chk_pyarrow_available() if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( pa.types.is_string(values.type) + or _is_string_view(values.type) or ( pa.types.is_dictionary(values.type) and ( pa.types.is_string(values.type.value_type) or pa.types.is_large_string(values.type.value_type) + or _is_string_view(values.type.value_type) ) ) ): @@ -216,7 +223,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: if self.dtype.na_value is np.nan and item is np.nan: item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{item}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(item).__name__}' instead." + ) return super().insert(loc, item) def _convert_bool_result(self, values, na=lib.no_default, method_name=None): @@ -248,13 +258,19 @@ def _maybe_convert_setitem_value(self, value): if isna(value): value = None elif not isinstance(value, str): - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{value}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." + ) else: value = np.array(value, dtype=object, copy=True) value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Must provide strings") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: @@ -465,6 +481,9 @@ def _cmp_method(self, other, op): return result.to_numpy(np.bool_, na_value=False) return result + def __pos__(self) -> Self: + raise TypeError(f"bad operand type for unary +: '{self.dtype}'") + class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a8a0037d0bbb9..c5b3129c506c8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -115,10 +115,10 @@ class TimedeltaArray(dtl.TimelikeOps): ---------- data : array-like The timedelta data. - dtype : numpy.dtype Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted. freq : Offset, optional + Frequency of the data. copy : bool, default False Whether to copy the underlying array of data. @@ -130,6 +130,12 @@ class TimedeltaArray(dtl.TimelikeOps): ------- None + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + TimedeltaIndex : Immutable Index of timedelta64 data. + to_timedelta : Convert argument to timedelta. 
+ Examples -------- >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(["1h", "2h"])) diff --git a/pandas/core/base.py b/pandas/core/base.py index 58572aab5b20f..a64cd8633c1db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -361,8 +361,11 @@ def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) + # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug + # https://github.com/ipython/ipython/issues/14412 + # https://github.com/davidhalter/jedi/issues/1990 @property - def ndim(self) -> Literal[1]: + def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. @@ -503,6 +506,11 @@ def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. + This property provides direct access to the underlying array data of a + Series or Index without requiring conversion to a NumPy array. It + returns an ExtensionArray, which is the native storage format for + pandas extension dtypes. + Returns ------- ExtensionArray diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 4ccfbd71d9ce8..9d844e590582a 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -190,8 +190,8 @@ def eval( .. warning:: - ``eval`` can run arbitrary code which can make you vulnerable to code - injection and untrusted data. + This function can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- @@ -371,10 +371,12 @@ def eval( is_extension_array_dtype(parsed_expr.terms.return_type) and not is_string_dtype(parsed_expr.terms.return_type) ) - or getattr(parsed_expr.terms, "operand_types", None) is not None - and any( - (is_extension_array_dtype(elem) and not is_string_dtype(elem)) - for elem in parsed_expr.terms.operand_types + or ( + getattr(parsed_expr.terms, "operand_types", None) is not None + and any( + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) + for elem in parsed_expr.terms.operand_types + ) ) ): warnings.warn( diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 7025d8a72e561..010fad1bbf0b6 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -512,8 +512,7 @@ def _maybe_evaluate_binop( ) if self.engine != "pytables" and ( - res.op in CMP_OPS_SYMS - and getattr(lhs, "is_datetime", False) + (res.op in CMP_OPS_SYMS and getattr(lhs, "is_datetime", False)) or getattr(rhs, "is_datetime", False) ): # all date ops must be done in python bc numexpr doesn't work diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index e2acd9a2c97c2..5a5fad0d83d7a 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None: ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b): +def _evaluate_standard(op, op_str, left_op, right_op): """ Standard evaluation. 
""" if _TEST_MODE: _store_test_result(False) - return op(a, b) + return op(left_op, right_op) -def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: - """return a boolean if we WILL be using numexpr""" +def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool: + """return left_op boolean if we WILL be using numexpr""" if op_str is not None: # required min elements (otherwise we are adding overhead) - if a.size > _MIN_ELEMENTS: + if left_op.size > _MIN_ELEMENTS: # check for dtype compatibility dtypes: set[str] = set() - for o in [a, b]: + for o in [left_op, right_op]: # ndarray and Series Case if hasattr(o, "dtype"): dtypes |= {o.dtype.name} @@ -93,22 +93,22 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: return False -def _evaluate_numexpr(op, op_str, a, b): +def _evaluate_numexpr(op, op_str, left_op, right_op): result = None - if _can_use_numexpr(op, op_str, a, b, "evaluate"): + if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"): is_reversed = op.__name__.strip("_").startswith("r") if is_reversed: # we were originally called by a reversed op method - a, b = b, a + left_op, right_op = right_op, left_op - a_value = a - b_value = b + left_value = left_op + right_value = right_op try: result = ne.evaluate( - f"a_value {op_str} b_value", - local_dict={"a_value": a_value, "b_value": b_value}, + f"left_value {op_str} right_value", + local_dict={"left_value": left_value, "right_value": right_value}, casting="safe", ) except TypeError: @@ -116,20 +116,20 @@ def _evaluate_numexpr(op, op_str, a, b): # (https://github.com/pydata/numexpr/issues/379) pass except NotImplementedError: - if _bool_arith_fallback(op_str, a, b): + if _bool_arith_fallback(op_str, left_op, right_op): pass else: raise if is_reversed: # reverse order to original for fallback - a, b = b, a + left_op, right_op = right_op, left_op if _TEST_MODE: _store_test_result(result is not None) if result is None: - result = _evaluate_standard(op, op_str, a, b) + result = _evaluate_standard(op, op_str, left_op, right_op) return result @@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b): } -def _where_standard(cond, a, b): +def _where_standard(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary - return np.where(cond, a, b) + return np.where(cond, left_op, right_op) -def _where_numexpr(cond, a, b): +def _where_numexpr(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary result = None - if _can_use_numexpr(None, "where", a, b, "where"): + if _can_use_numexpr(None, "where", left_op, right_op, "where"): result = ne.evaluate( "where(cond_value, a_value, b_value)", - local_dict={"cond_value": cond, "a_value": a, "b_value": b}, + local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op}, casting="safe", ) if result is None: - result = _where_standard(cond, a, b) + result = _where_standard(cond, left_op, right_op) return result @@ -206,13 +206,13 @@ def _has_bool_dtype(x): _BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"} -def _bool_arith_fallback(op_str, a, b) -> bool: +def _bool_arith_fallback(op_str, left_op, right_op) -> bool: """ Check if we should fallback to the python `_evaluate_standard` in case of an unsupported operation by numexpr, which is the case for some boolean ops. 
""" - if _has_bool_dtype(a) and _has_bool_dtype(b): + if _has_bool_dtype(left_op) and _has_bool_dtype(right_op): if op_str in _BOOL_OP_UNSUPPORTED: warnings.warn( f"evaluating in Python space because the {op_str!r} " @@ -224,15 +224,15 @@ def _bool_arith_fallback(op_str, a, b) -> bool: return False -def evaluate(op, a, b, use_numexpr: bool = True): +def evaluate(op, left_op, right_op, use_numexpr: bool = True): """ - Evaluate and return the expression of the op on a and b. + Evaluate and return the expression of the op on left_op and right_op. Parameters ---------- op : the actual operand - a : left operand - b : right operand + left_op : left operand + right_op : right operand use_numexpr : bool, default True Whether to try to use numexpr. """ @@ -240,24 +240,27 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: if use_numexpr: # error: "None" not callable - return _evaluate(op, op_str, a, b) # type: ignore[misc] - return _evaluate_standard(op, op_str, a, b) + return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc] + return _evaluate_standard(op, op_str, left_op, right_op) -def where(cond, a, b, use_numexpr: bool = True): +def where(cond, left_op, right_op, use_numexpr: bool = True): """ - Evaluate the where condition cond on a and b. + Evaluate the where condition cond on left_op and right_op. Parameters ---------- cond : np.ndarray[bool] - a : return if cond is True - b : return if cond is False + left_op : return if cond is True + right_op : return if cond is False use_numexpr : bool, default True Whether to try to use numexpr. """ assert _where is not None - return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) + if use_numexpr: + return _where(cond, left_op, right_op) + else: + return _where_standard(cond, left_op, right_op) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index a1a5f77f8539e..9b26de42e119b 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -76,8 +76,7 @@ class Term: def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls - # error: Argument 2 for "super" not an instance of argument 1 - supr_new = super(Term, klass).__new__ # type: ignore[misc] + supr_new = super(Term, klass).__new__ return supr_new(klass) is_local: bool diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 39511048abf49..166c9d47294cd 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -205,7 +205,7 @@ def generate(self, v) -> str: val = v.tostring(self.encoding) return f"({self.lhs} {self.op} {val})" - def convert_value(self, v) -> TermValue: + def convert_value(self, conv_val) -> TermValue: """ convert the expression that is in the term to something that is accepted by pytables @@ -219,44 +219,44 @@ def stringify(value): kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) if kind == "datetime" or (kind and kind.startswith("datetime64")): - if isinstance(v, (int, float)): - v = stringify(v) - v = ensure_decoded(v) - v = Timestamp(v).as_unit("ns") - if v.tz is not None: - v = v.tz_convert("UTC") - return TermValue(v, v._value, kind) + if isinstance(conv_val, (int, float)): + conv_val = stringify(conv_val) + conv_val = ensure_decoded(conv_val) + conv_val = Timestamp(conv_val).as_unit("ns") + if conv_val.tz is not None: + conv_val = conv_val.tz_convert("UTC") + return TermValue(conv_val, 
conv_val._value, kind) elif kind in ("timedelta64", "timedelta"): - if isinstance(v, str): - v = Timedelta(v) + if isinstance(conv_val, str): + conv_val = Timedelta(conv_val) else: - v = Timedelta(v, unit="s") - v = v.as_unit("ns")._value - return TermValue(int(v), v, kind) + conv_val = Timedelta(conv_val, unit="s") + conv_val = conv_val.as_unit("ns")._value + return TermValue(int(conv_val), conv_val, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) result: npt.NDArray[np.intp] | np.intp | int - if v not in metadata: + if conv_val not in metadata: result = -1 else: - result = metadata.searchsorted(v, side="left") + result = metadata.searchsorted(conv_val, side="left") return TermValue(result, result, "integer") elif kind == "integer": try: - v_dec = Decimal(v) + v_dec = Decimal(conv_val) except InvalidOperation: # GH 54186 # convert v to float to raise float's ValueError - float(v) + float(conv_val) else: - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) - return TermValue(v, v, kind) + conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + return TermValue(conv_val, conv_val, kind) elif kind == "float": - v = float(v) - return TermValue(v, v, kind) + conv_val = float(conv_val) + return TermValue(conv_val, conv_val, kind) elif kind == "bool": - if isinstance(v, str): - v = v.strip().lower() not in [ + if isinstance(conv_val, str): + conv_val = conv_val.strip().lower() not in [ "false", "f", "no", @@ -268,13 +268,15 @@ def stringify(value): "", ] else: - v = bool(v) - return TermValue(v, v, kind) - elif isinstance(v, str): + conv_val = bool(conv_val) + return TermValue(conv_val, conv_val, kind) + elif isinstance(conv_val, str): # string quoting - return TermValue(v, stringify(v), "string") + return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass @@ -408,11 +410,12 @@ def prune(self, klass): operand = operand.prune(klass) if operand is not None and ( - issubclass(klass, ConditionBinOp) - and operand.condition is not None - or not issubclass(klass, ConditionBinOp) - and issubclass(klass, FilterBinOp) - and operand.filter is not None + (issubclass(klass, ConditionBinOp) and operand.condition is not None) + or ( + not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ) ): return operand.invert() return None diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 7b31e03e58b4b..336d62b9d9579 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -140,7 +140,7 @@ class Scope: temps : dict """ - __slots__ = ["level", "scope", "target", "resolvers", "temps"] + __slots__ = ["level", "resolvers", "scope", "target", "temps"] level: int scope: DeepChainMap resolvers: DeepChainMap diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index e4eefb570fd95..20fe8cbab1c9f 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -100,7 +100,10 @@ def use_numba_cb(key: str) -> None: : int If max_rows is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as - a summary view. 'None' value means unlimited. + a summary view. + + 'None' value means unlimited. 
Beware that printing a large number of rows + could cause your rendering environment (the browser, etc.) to crash. In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 and pandas will auto-detect @@ -121,7 +124,11 @@ def use_numba_cb(key: str) -> None: : int If max_cols is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as - a summary view. 'None' value means unlimited. + a summary view. + + 'None' value means unlimited. Beware that printing a large number of + columns could cause your rendering environment (the browser, etc.) to + crash. In case python/IPython is running in a terminal and `large_repr` equals 'truncate' this can be set to 0 or None and pandas will auto-detect diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 1e1292f8ef089..50088804e0245 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -596,6 +596,8 @@ def sanitize_array( # create an extension array from its dtype _sanitize_non_ordered(data) cls = dtype.construct_array_type() + if not hasattr(data, "__array__"): + data = list(data) subarr = cls._from_sequence(data, dtype=dtype, copy=copy) # GH#846 @@ -807,6 +809,12 @@ def _try_cast( ) elif dtype.kind in "mM": + if is_ndarray: + arr = cast(np.ndarray, arr) + if arr.ndim == 2 and arr.shape[1] == 1: + # GH#60081: DataFrame Constructor converts 1D data to array of + # shape (N, 1), but maybe_cast_to_datetime assumes 1D input + return maybe_cast_to_datetime(arr[:, 0], dtype).reshape(arr.shape) return maybe_cast_to_datetime(arr, dtype) # GH#15832: Check if we are requesting a numeric dtype and diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6ba07b1761557..02b9291da9b31 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1162,6 +1162,7 @@ def convert_dtypes( def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], + convert_to_nullable_dtype: bool = False, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: """ we might have a array (or single object) that is datetime like, @@ -1199,13 +1200,14 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, + convert_to_nullable_dtype=convert_to_nullable_dtype, dtype_if_all_nat=np.dtype("M8[s]"), ) def maybe_cast_to_datetime( value: np.ndarray | list, dtype: np.dtype -) -> ExtensionArray | np.ndarray: +) -> DatetimeArray | TimedeltaArray | np.ndarray: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1579,7 +1581,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. 
@@ -1597,11 +1599,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: @@ -1749,6 +1749,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: except (ValueError, TypeError): return False + if dtype == "string": + try: + arr._maybe_convert_setitem_value(element) # type: ignore[union-attr] + return True + except (ValueError, TypeError): + return False + # This is technically incorrect, but maintains the behavior of # ExtensionBlock._can_hold_element return True diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 98c770ec4a8b0..b0c8ec1ffc083 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -430,7 +430,7 @@ def is_period_dtype(arr_or_dtype) -> bool: Check whether an array-like or dtype is of the Period dtype. .. deprecated:: 2.2.0 - Use isinstance(dtype, pd.Period) instead. + Use isinstance(dtype, pd.PeriodDtype) instead. Parameters ---------- @@ -1785,16 +1785,22 @@ def pandas_dtype(dtype) -> DtypeObj: Parameters ---------- - dtype : object to be converted + dtype : object + The object to be converted into a dtype. Returns ------- np.dtype or a pandas dtype + The converted dtype, which can be either a numpy dtype or a pandas dtype. Raises ------ TypeError if not a dtype + See Also + -------- + api.types.is_dtype : Return true if the condition is satisfied for the arr_or_dtype. + Examples -------- >>> pd.api.types.pandas_dtype(int) @@ -1883,13 +1889,14 @@ def is_all_strings(value: ArrayLike) -> bool: __all__ = [ - "classes", "DT64NS_DTYPE", + "INT64_DTYPE", + "TD64NS_DTYPE", + "classes", "ensure_float64", "ensure_python_int", "ensure_str", "infer_dtype_from_object", - "INT64_DTYPE", "is_1d_only_ea_dtype", "is_all_strings", "is_any_real_numeric_dtype", @@ -1934,6 +1941,5 @@ def is_all_strings(value: ArrayLike) -> bool: "is_unsigned_integer_dtype", "needs_i8_conversion", "pandas_dtype", - "TD64NS_DTYPE", "validate_all_hashable", ] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index bb6610c514375..1eb1a630056a2 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -48,6 +48,7 @@ from pandas._libs.tslibs.offsets import BDay from pandas.compat import pa_version_under10p1 from pandas.errors import PerformanceWarning +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( @@ -72,7 +73,7 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: TCH004 + import pyarrow as pa # noqa: TC004 from pandas._typing import ( Dtype, @@ -155,6 +156,7 @@ class CategoricalDtypeType(type): @register_extension_dtype +@set_module("pandas") class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness. 
@@ -706,6 +708,7 @@ def index_class(self) -> type_t[CategoricalIndex]: @register_extension_dtype +@set_module("pandas") class DatetimeTZDtype(PandasExtensionDtype): """ An ExtensionDtype for timezone-aware datetime data. @@ -974,6 +977,7 @@ def index_class(self) -> type_t[DatetimeIndex]: @register_extension_dtype +@set_module("pandas") class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): """ An ExtensionDtype for Period data. @@ -1111,10 +1115,8 @@ def construct_from_string(cls, string: str_type) -> PeriodDtype: possible """ if ( - isinstance(string, str) - and (string.startswith(("period[", "Period["))) - or isinstance(string, BaseOffset) - ): + isinstance(string, str) and (string.startswith(("period[", "Period["))) + ) or isinstance(string, BaseOffset): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: @@ -1215,6 +1217,7 @@ def index_class(self) -> type_t[PeriodIndex]: @register_extension_dtype +@set_module("pandas") class IntervalDtype(PandasExtensionDtype): """ An ExtensionDtype for Interval data. @@ -1691,6 +1694,7 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: @register_extension_dtype +@set_module("pandas") class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. @@ -2124,12 +2128,15 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: PerformanceWarning, stacklevel=find_stack_level(), ) - np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) - return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value) + # error: Argument 1 to "np_find_common_type" has incompatible type + # "*Generator[Any | dtype[Any] | ExtensionDtype, None, None]"; + # expected "dtype[Any]" [arg-type] + return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value) # type: ignore [arg-type] @register_extension_dtype +@set_module("pandas") class ArrowDtype(StorageExtensionDtype): """ An ExtensionDtype for PyArrow data types. @@ -2332,9 +2339,11 @@ def construct_from_string(cls, string: str) -> ArrowDtype: ) if not string.endswith("[pyarrow]"): raise TypeError(f"'{string}' must end with '[pyarrow]'") - if string == "string[pyarrow]": + if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" try: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 6adb34ff0f777..918d107f2ce6c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool: Parameters ---------- obj : The object to check + The object to check if the object can be compiled into a regex pattern instance. Returns ------- bool Whether `obj` can be compiled as a regex pattern. + See Also + -------- + api.types.is_re : Check if the object is a regex pattern instance. 
+ Examples -------- >>> from pandas.api.types import is_re_compilable diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index b9cd6ae2f13e8..f20ca44728664 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -19,6 +19,7 @@ NaT, iNaT, ) +from pandas.util._decorators import set_module from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -93,6 +94,7 @@ def isna( def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: ... +@set_module("pandas") def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: """ Detect missing values for an array-like object. @@ -307,6 +309,7 @@ def notna( def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: ... +@set_module("pandas") def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: """ Detect non-missing values for an array-like object. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c4defdb24370f..d9f7623064e05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1018,7 +1018,7 @@ def shape(self) -> tuple[int, int]: See Also -------- - ndarray.shape : Tuple of array dimensions. + numpy.ndarray.shape : Tuple of array dimensions. Examples -------- @@ -1397,6 +1397,11 @@ def style(self) -> Styler: Please see `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. """ + # Raise AttributeError so that inspect works even if jinja2 is not installed. + has_jinja2 = import_optional_dependency("jinja2", errors="ignore") + if not has_jinja2: + raise AttributeError("The '.style' accessor requires jinja2") + from pandas.io.formats.style import Styler return Styler(self) @@ -2110,8 +2115,8 @@ def from_records( """ Convert structured or record ndarray to DataFrame. - Creates a DataFrame object from a structured ndarray, sequence of - tuples or dicts, or DataFrame. + Creates a DataFrame object from a structured ndarray, or sequence of + tuples or dicts. Parameters ---------- @@ -2312,7 +2317,10 @@ def maybe_reorder( columns = columns.drop(exclude) mgr = arrays_to_mgr(arrays, columns, result_index) - return cls._from_mgr(mgr, axes=mgr.axes) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) + if cls is not DataFrame: + return cls(df, copy=False) + return df def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None @@ -3924,8 +3932,7 @@ def __getitem__(self, key): # GH#45316 Return view if key is not duplicated # Only use drop_duplicates with duplicates for performance if not is_mi and ( - self.columns.is_unique - and key in self.columns + (self.columns.is_unique and key in self.columns) or key in self.columns.drop_duplicates(keep=False) ): return self._get_item(key) @@ -4472,8 +4479,10 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No """ Query the columns of a DataFrame with a boolean expression. - This method can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. + .. warning:: + + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- @@ -4630,6 +4639,11 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: """ Evaluate a string describing operations on DataFrame columns. + .. warning:: + + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. + Operates on columns only, not specific rows or elements. 
This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. @@ -4737,7 +4751,8 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 3 4 4 7 8 0 4 5 2 6 7 3 - For columns with spaces in their name, you can use backtick quoting. + For columns with spaces or other disallowed characters in their name, you can + use backtick quoting. >>> df.eval("B * `C&C`") 0 100 @@ -5004,7 +5019,7 @@ def assign(self, **kwargs) -> DataFrame: Parameters ---------- - **kwargs : dict of {str: callable or Series} + **kwargs : callable or Series The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not @@ -5705,7 +5720,7 @@ def shift( "Passing a 'freq' together with a 'fill_value' is not allowed." ) - if self.empty: + if self.empty and freq is None: return self.copy() axis = self._get_axis_number(axis) @@ -6770,8 +6785,7 @@ def f(vals) -> tuple[np.ndarray, int]: elif ( not np.iterable(subset) or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self.columns + or (isinstance(subset, tuple) and subset in self.columns) ): subset = (subset,) @@ -6876,7 +6890,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. Returns ------- @@ -7956,6 +7971,16 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer] new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer] + + # GH#60498 For MultiIndex column alignment + if isinstance(cols, MultiIndex): + # When overwriting column names, make a shallow copy so as to not modify + # the input DFs + new_left = new_left.copy(deep=False) + new_right = new_right.copy(deep=False) + new_left.columns = cols + new_right.columns = cols + result = op(new_left, new_right) # Do the join on the columns instead of using left._align_for_op @@ -7986,6 +8011,13 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b if not isinstance(right, DataFrame): return False + if ( + isinstance(self.columns, MultiIndex) + or isinstance(right.columns, MultiIndex) + ) and not self.columns.equals(right.columns): + # GH#60498 Reindex if MultiIndex columns are not matching + return True + if fill_value is None and level is None and axis == 1: # TODO: any other cases we should handle here? @@ -8639,6 +8671,7 @@ def combine( 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare + other_columns = other.columns this, other = self.align(other) new_index = this.index @@ -8649,8 +8682,8 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - # sorts if possible; otherwise align above ensures that these are set-equal - new_columns = this.columns.union(other.columns) + # preserve column order + new_columns = self.columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} for col in new_columns: @@ -13640,6 +13673,10 @@ def isin_(x): doc=""" The column labels of the DataFrame. + This property holds the column names as a pandas ``Index`` object.
+ It provides an immutable sequence of column labels that can be + used for data selection, renaming, and alignment in DataFrame operations. + Returns ------- pandas.Index diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1759e1ef91d85..e0a4f9d9c546a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -603,9 +603,9 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: dtypes = self.dtypes return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k, dtype=dtypes[k] + v, copy=False, index=self.index, name=k, dtype=dtype ).__finalize__(self) - for k, v in zip(self.columns, self._iter_column_arrays()) + for k, v, dtype in zip(self.columns, self._iter_column_arrays(), dtypes) if not isinstance(k, int) } @@ -640,7 +640,7 @@ def ndim(self) -> int: See Also -------- - ndarray.ndim : Number of array dimensions. + numpy.ndarray.ndim : Number of array dimensions. Examples -------- @@ -665,7 +665,7 @@ def size(self) -> int: See Also -------- - ndarray.size : Number of elements in the array. + numpy.ndarray.size : Number of elements in the array. Examples -------- @@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any: return result @final - def squeeze(self, axis: Axis | None = None): + def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: """ Squeeze 1 dimensional axis objects into scalars. @@ -2014,9 +2014,24 @@ def empty(self) -> bool: def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is False and not self._mgr.is_single_block and not self.empty: + # check this manually, otherwise ._values will already return a copy + # and np.array(values, copy=False) will not raise an error + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block: + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if ( + copy is not True + and astype_is_view(values.dtype, arr.dtype) + and self._mgr.is_single_block + ): # Check if both conversions can be done without a copy if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( values.dtype, arr.dtype @@ -2196,8 +2211,9 @@ def to_excel( via the options ``io.excel.xlsx.writer`` or ``io.excel.xlsm.writer``. - merge_cells : bool, default True - Write MultiIndex and Hierarchical Rows as merged cells. + merge_cells : bool or 'columns', default False + If True, write MultiIndex index and columns as merged cells. + If 'columns', merge MultiIndex column cells only. {encoding_parameter} inf_rep : str, default 'inf' Representation for infinity (there is no native representation for @@ -3324,9 +3340,9 @@ def to_latex( r""" Render object to a LaTeX tabular, longtable, or nested table. - Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted + Requires ``\usepackage{booktabs}``. The output can be copy/pasted into a main LaTeX document or read from an external file - with ``\input{{table.tex}}``. + with ``\input{table.tex}``. .. versionchanged:: 2.0.0 Refactored to use the Styler implementation via jinja2 templating. @@ -3344,13 +3360,13 @@ def to_latex( Write row names (index). na_rep : str, default 'NaN' Missing data representation. 
- formatters : list of functions or dict of {{str: function}}, optional + formatters : list of functions or dict of {str: function}, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string. List must be of length equal to the number of columns. float_format : one-parameter function or str, optional, default None Formatter for floating point numbers. For example - ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will + ``float_format="%.2f"`` and ``float_format="{:0.2f}".format`` will both result in 0.1234 being formatted as 0.12. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print @@ -3367,7 +3383,7 @@ def to_latex( columns of numbers, which default to 'r'. longtable : bool, optional Use a longtable environment instead of tabular. Requires - adding a \usepackage{{longtable}} to your LaTeX preamble. + adding a \usepackage{longtable} to your LaTeX preamble. By default, the value will be read from the pandas config module, and set to `True` if the option ``styler.latex.environment`` is `"longtable"`. @@ -3405,7 +3421,7 @@ def to_latex( default value to "r". multirow : bool, default True Use \multirow to enhance MultiIndex rows. Requires adding a - \usepackage{{multirow}} to your LaTeX preamble. Will print + \usepackage{multirow} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module, and is set as the option @@ -3416,15 +3432,15 @@ def to_latex( default value to `True`. caption : str or tuple, optional Tuple (full_caption, short_caption), - which results in ``\caption[short_caption]{{full_caption}}``; + which results in ``\caption[short_caption]{full_caption}``; if a single string is passed, no short caption will be set. label : str, optional - The LaTeX label to be placed inside ``\label{{}}`` in the output. - This is used with ``\ref{{}}`` in the main ``.tex`` file. + The LaTeX label to be placed inside ``\label{}`` in the output. + This is used with ``\ref{}`` in the main ``.tex`` file. position : str, optional The LaTeX positional argument for tables, to be placed after - ``\begin{{}}`` in the output. + ``\begin{}`` in the output. Returns ------- @@ -3862,6 +3878,14 @@ def to_csv( >>> import os # doctest: +SKIP >>> os.makedirs("folder/subfolder", exist_ok=True) # doctest: +SKIP >>> df.to_csv("folder/subfolder/out.csv") # doctest: +SKIP + + Format floats to two decimal places: + + >>> df.to_csv("out1.csv", float_format="%.2f") # doctest: +SKIP + + Format floats using scientific notation: + + >>> df.to_csv("out2.csv", float_format="{{:.2e}}".format) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() @@ -4860,7 +4884,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. Returns ------- @@ -7653,8 +7678,12 @@ def interpolate( * 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes. * 'time': Works on daily and higher resolution data to interpolate - given length of interval. 
- * 'index', 'values': use the actual numerical values of the index. + given length of interval. This interpolates values based on + time interval between observations. + * 'index': The interpolation uses the numerical values + of the DataFrame's index to linearly calculate missing values. + * 'values': Interpolation based on the numerical values + in the DataFrame, treating them as equally spaced along the index. * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial': Passed to `scipy.interpolate.interp1d`, whereas 'spline' is passed to @@ -8009,7 +8038,9 @@ def asof(self, where, subset=None): np.nan, index=self.columns, name=where[0] ) - locs = self.index.asof_locs(where, ~(nulls._values)) + # error: Unsupported operand type for + # ~ ("ExtensionArray | ndarray[Any, Any] | Any") + locs = self.index.asof_locs(where, ~nulls._values) # type: ignore[operator] # mask the missing mask = locs == -1 diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 8248f378e2c1a..ec477626a098f 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -8,8 +8,8 @@ __all__ = [ "DataFrameGroupBy", - "NamedAgg", - "SeriesGroupBy", "GroupBy", "Grouper", + "NamedAgg", + "SeriesGroupBy", ] diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index bad9749b5ecee..7699fb3d0f864 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -50,6 +50,7 @@ class OutputKey: "sem", "size", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f076f8d79f104..1251403db6ff3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -32,6 +32,7 @@ Appender, Substitution, doc, + set_module, ) from pandas.util._exceptions import find_stack_level @@ -108,6 +109,7 @@ ScalarResult = TypeVar("ScalarResult") +@set_module("pandas") class NamedAgg(NamedTuple): """ Helper for column specific aggregation with control over output column names. @@ -142,6 +144,7 @@ class NamedAgg(NamedTuple): aggfunc: AggScalar +@set_module("pandas.api.typing") class SeriesGroupBy(GroupBy[Series]): def _wrap_agged_manager(self, mgr: Manager) -> Series: out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes) @@ -580,6 +583,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -1267,13 +1272,86 @@ def skew( Name: Max Speed, dtype: float64 """ + return self._cython_agg_general( + "skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + Unbiased kurtosis within groups. + + See Also + -------- + Series.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... 
"Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon 333.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Parrot 40.0 + Parrot 41.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt() + Falcon 1.622109 + Parrot -2.878714 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt(skipna=False) + Falcon NaN + Parrot -2.878714 + Name: Max Speed, dtype: float64 + """ + def alt(obj): # This should not be reached since the cython path should raise # TypeError and not NotImplementedError. - raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) @property @@ -1318,8 +1396,8 @@ def idxmin(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the minimum value. + Series + Indexes of minima in each group. Raises ------ @@ -1371,8 +1449,8 @@ def idxmax(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the maximum value. + Series + Indexes of maxima in each group. Raises ------ @@ -1440,6 +1518,11 @@ def is_monotonic_increasing(self) -> Series: ------- Series + See Also + -------- + SeriesGroupBy.is_monotonic_decreasing : Return whether each group's values + are monotonically decreasing. + Examples -------- >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"]) @@ -1459,6 +1542,11 @@ def is_monotonic_decreasing(self) -> Series: ------- Series + See Also + -------- + SeriesGroupBy.is_monotonic_increasing : Return whether each group's values + are monotonically increasing. + Examples -------- >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"]) @@ -1555,6 +1643,7 @@ def unique(self) -> Series: return result +@set_module("pandas.api.typing") class DataFrameGroupBy(GroupBy[DataFrame]): _agg_examples_doc = dedent( """ @@ -1953,6 +2042,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -2439,6 +2530,10 @@ def nunique(self, dropna: bool = True) -> DataFrame: nunique: DataFrame Counts of unique elements in each position. + See Also + -------- + DataFrame.nunique : Count number of distinct elements in specified axis. + Examples -------- >>> df = pd.DataFrame( @@ -2494,8 +2589,8 @@ def idxmax( Returns ------- - Series - Indexes of maxima in each group. + DataFrame + Indexes of maxima in each column according to the group. Raises ------ @@ -2505,6 +2600,7 @@ def idxmax( See Also -------- Series.idxmax : Return index of the maximum element. + DataFrame.idxmax : Indexes of maxima along the specified axis. Notes ----- @@ -2518,6 +2614,7 @@ def idxmax( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2528,12 +2625,14 @@ def idxmax( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the maximum value in each column. + By default, it returns the index for the maximum value in each column + according to the group. 
- >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object + >>> df.groupby("food_type").idxmax() + consumption co2_emissions + food_type + meat Beef Beef + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna) @@ -2556,8 +2655,8 @@ def idxmin( Returns ------- - Series - Indexes of minima in each group. + DataFrame + Indexes of minima in each column according to the group. Raises ------ @@ -2567,6 +2666,7 @@ def idxmin( See Also -------- Series.idxmin : Return index of the minimum element. + DataFrame.idxmin : Indexes of minima along the specified axis. Notes ----- @@ -2580,6 +2680,7 @@ def idxmin( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2590,12 +2691,14 @@ def idxmin( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the minimum value in each column. + By default, it returns the index for the minimum value in each column + according to the group. - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object + >>> df.groupby("food_type").idxmin() + consumption co2_emissions + food_type + meat Pork Pork + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna) @@ -2891,6 +2994,111 @@ def alt(obj): "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + Unbiased kurtosis within groups. + + See Also + -------- + DataFrame.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> arrays = [ + ... [ + ... "falcon", + ... "parrot", + ... "cockatoo", + ... "kiwi", + ... "eagle", + ... "lion", + ... "monkey", + ... "rabbit", + ... "dog", + ... "wolf", + ... ], + ... [ + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... ], + ... ] + >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class")) + >>> df = pd.DataFrame( + ... { + ... "max_speed": [ + ... 389.0, + ... 24.0, + ... 70.0, + ... np.nan, + ... 350.0, + ... 80.5, + ... 21.5, + ... 15.0, + ... 40.0, + ... 50.0, + ... ] + ... }, + ... index=index, + ...
) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + eagle bird 350.0 + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + dog mammal 40.0 + wolf mammal 50.0 + >>> gb = df.groupby(["class"]) + >>> gb.kurt() + max_speed + class + bird -5.493277 + mammal 0.204125 + >>> gb.kurt(skipna=False) + max_speed + class + bird NaN + mammal 0.204125 + """ + + return self._cython_agg_general( + "kurt", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + @property @doc(DataFrame.plot.__doc__) def plot(self) -> GroupByPlot: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 66db033596872..9c27df4ed8c1b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -214,6 +214,61 @@ class providing the base-class of operations. {example} """ +_groupby_agg_method_skipna_engine_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +skipna : bool, default {s} + Exclude NA/null values. If the entire group is NA and ``skipna`` is + ``True``, the result will be NA. + + .. versionchanged:: 3.0.0 + +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + +Examples +-------- +{example} +""" + _pipe_template = """ Apply a ``func`` with arguments to this %(klass)s object and return its result. @@ -435,6 +490,20 @@ def groups(self) -> dict[Hashable, Index]: """ Dict {group name -> group labels}. + This property provides a dictionary representation of the groupings formed + during a groupby operation, where each key represents a unique group value from + the specified column(s), and each value is a list of index labels + that belong to that group. + + See Also + -------- + core.groupby.DataFrameGroupBy.get_group : Retrieve group from a + ``DataFrameGroupBy`` object with provided name. + core.groupby.SeriesGroupBy.get_group : Retrieve group from a + ``SeriesGroupBy`` object with provided name. 
+ core.resample.Resampler.get_group : Retrieve group from a + ``Resampler`` object with provided name. + Examples -------- @@ -501,6 +570,22 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. + The dictionary keys represent the group labels (e.g., timestamps for a + time-based resampling operation), and the values are arrays of integer + positions indicating where the elements of each group are located in the + original data. This property is particularly useful when working with + resampled data, as it provides insight into how the original time-series data + has been grouped. + + See Also + -------- + core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.groupby.SeriesGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.resample.Resampler.indices : Provides a mapping of group rows to + positions of the elements. + Examples -------- @@ -692,7 +777,19 @@ def get_group(self, name) -> DataFrame | Series: Returns ------- - DataFrame or Series + Series or DataFrame + Get the respective Series or DataFrame corresponding to the group provided. + + See Also + -------- + DataFrameGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + DataFrameGroupBy.indices: Provides a mapping of group rows to positions + of the elements. + SeriesGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + SeriesGroupBy.indices: Provides a mapping of group rows to positions + of the elements. Examples -------- @@ -1358,7 +1455,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): # ----------------------------------------------------------------- # apply/agg/transform - def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = False, **kwargs) -> NDFrameT: """ Apply function ``func`` group-wise and combine the results together. @@ -1384,7 +1481,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: *args : tuple Optional positional arguments to pass to ``func``. - include_groups : bool, default True + include_groups : bool, default False When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. If this raises a TypeError, the result will be computed with the groupings excluded. @@ -1392,10 +1489,9 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default changed from True to False, and True is no longer allowed. **kwargs : dict Optional keyword arguments to pass to ``func``. @@ -1485,7 +1581,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) + >>> g1.apply(lambda x: x.C.max() - x.B.min()) A a 5 b 2 @@ -1494,11 +1590,13 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: Example 4: The function passed to ``apply`` returns ``None`` for one of the group. 
This group is filtered from the result: - >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x, include_groups=False) + >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x) B C 0 1 4 1 2 6 """ + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") if isinstance(func, str): if hasattr(self, func): res = getattr(self, func) @@ -1525,33 +1623,7 @@ def f(g): else: f = func - if not include_groups: - return self._python_apply_general(f, self._obj_with_exclusions) - - try: - result = self._python_apply_general(f, self._selected_obj) - if ( - not isinstance(self.obj, Series) - and self._selection is None - and self._selected_obj.shape != self._obj_with_exclusions.shape - ): - warnings.warn( - message=_apply_groupings_depr.format(type(self).__name__, "apply"), - category=DeprecationWarning, - stacklevel=find_stack_level(), - ) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. a numeric operation - # on a string grouper column - - return self._python_apply_general(f, self._obj_with_exclusions) - - return result + return self._python_apply_general(f, self._obj_with_exclusions) @final def _python_apply_general( @@ -2081,6 +2153,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def mean( self, numeric_only: bool = False, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2096,6 +2169,12 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. @@ -2162,17 +2241,21 @@ def mean( executor.float_dtype_mapping, engine_kwargs, min_periods=0, + skipna=skipna, ) else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False) -> NDFrameT: + def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2187,6 +2270,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. 
versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2259,8 +2348,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2273,6 +2365,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2311,6 +2404,12 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2365,14 +2464,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2384,6 +2485,7 @@ def var( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2421,6 +2523,12 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2474,13 +2582,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2610,7 +2720,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2630,11 +2742,22 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame Standard error of the mean of values within each group. + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. 
+ Examples -------- For SeriesGroupBy: @@ -2699,9 +2822,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2802,10 +2926,11 @@ def size(self) -> DataFrame | Series: @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="sum", no=False, mc=0, + s=True, e=None, ek=None, example=dedent( @@ -2847,6 +2972,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2858,6 +2984,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -2869,12 +2996,15 @@ def sum( min_count=min_count, alias="sum", npfunc=np.sum, + skipna=skipna, ) return result @final - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: """ Compute prod of group values. @@ -2891,6 +3021,12 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2939,17 +3075,22 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: 2 30 72 """ return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + alias="prod", + npfunc=np.prod, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="min", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -2989,6 +3130,7 @@ def min( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3001,23 +3143,26 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="min", npfunc=np.min, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="max", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3057,6 +3202,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3069,11 +3215,13 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="max", npfunc=np.max, ) @@ -3384,7 +3532,9 @@ def describe( return result @final - def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + def resample( + self, rule, *args, include_groups: bool = False, 
**kwargs + ) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3409,10 +3559,9 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default was changed to False, and True is no longer allowed. **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and @@ -3445,7 +3594,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby("a").resample("3min", include_groups=False).sum() + >>> df.groupby("a").resample("3min").sum() b a 0 2000-01-01 00:00:00 2 @@ -3454,7 +3603,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Upsample the series into 30 second bins. - >>> df.groupby("a").resample("30s", include_groups=False).sum() + >>> df.groupby("a").resample("30s").sum() b a 0 2000-01-01 00:00:00 1 @@ -3468,7 +3617,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Resample by month. Values are assigned to the month of the period. - >>> df.groupby("a").resample("ME", include_groups=False).sum() + >>> df.groupby("a").resample("ME").sum() b a 0 2000-01-31 3 @@ -3477,11 +3626,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right").sum()) b a 0 1999-12-31 23:57:00 1 @@ -3492,11 +3637,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp the bin interval, but label each bin using the right edge instead of the left. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", label="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right", label="right").sum()) b a 0 2000-01-01 00:00:00 1 @@ -3505,11 +3646,10 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp """ from pandas.core.resample import get_resampler_for_grouping - # mypy flags that include_groups could be specified via `*args` or `**kwargs` - # GH#54961 would resolve. - return get_resampler_for_grouping( # type: ignore[misc] - self, rule, *args, include_groups=include_groups, **kwargs - ) + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") + + return get_resampler_for_grouping(self, rule, *args, **kwargs) @final def rolling( @@ -3577,14 +3717,21 @@ def rolling( an integer index is not used to calculate the rolling window. closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + Determines the inclusivity of points in the window + If ``'right'``, (First, Last] the last point in the window + is included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'left'``, [First, Last) the first point in the window + is included in the calculations. - If ``'both'``, no points in the window are excluded from calculations. + If ``'both'``, [First, Last] all points in the window + are included in the calculations. 
- If ``'neither'``, the first and last points in the window are excluded - from calculations. + If ``'neither'``, (First, Last) the first and last points + in the window are excluded from calculations. + + () and [] are referencing open and closed set + notation respectively. + + Default ``None`` (``'right'``). @@ -3969,19 +4116,6 @@ def nth(self) -> GroupByNthSelector: 'all' or 'any'; this is equivalent to calling dropna(how=dropna) before the groupby. - Parameters - ---------- - n : int, slice or list of ints and slices - A single nth value for the row or a list of nth values or slices. - - .. versionchanged:: 1.4.0 - Added slice and lists containing slices. - Added index notation. - - dropna : {'any', 'all', None}, default None - Apply the specified dropna operation before counting which row is - the nth row. Only supported if n is an int. - Returns ------- Series or DataFrame @@ -4162,9 +4296,9 @@ def quantile( starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): + if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype): raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" + f"dtype '{vals.dtype}' does not support operation 'quantile'" ) inference: DtypeObj | None = None @@ -5534,13 +5668,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi - - - # GH#7155 -_apply_groupings_depr = ( - "{}.{} operated on the grouping columns. This behavior is deprecated, " - "and in a future version of pandas the grouping columns will be excluded " - "from the operation. Either pass `include_groups=False` to exclude the " - "groupings or explicitly select the grouping columns after groupby to silence " - "this warning."
-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4c7fe604e452d..c4c7f73ee166c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -144,6 +144,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "std": functools.partial(libgroupby.group_var, name="std"), "sem": functools.partial(libgroupby.group_var, name="sem"), "skew": "group_skew", + "kurt": "group_kurt", "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", @@ -193,7 +194,7 @@ def _get_cython_function( elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f - elif how == "skew": + elif how in ["skew", "kurt"]: # _get_cython_vals will convert to float64 pass elif "object" not in f.__signatures__: @@ -224,7 +225,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median", "std", "sem", "skew"]: + if how in ["median", "std", "sem", "skew", "kurt"]: # median only has a float64 implementation # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -453,7 +454,7 @@ def _call_cython_op( **kwargs, ) result = result.astype(bool, copy=False) - elif self.how in ["skew"]: + elif self.how in ["skew", "kurt"]: func( out=result, counts=counts, @@ -1021,6 +1022,7 @@ def apply_groupwise( # getattr pattern for __name__ is needed for functools.partial objects if len(group_keys) == 0 and getattr(f, "__name__", None) in [ "skew", + "kurt", "sum", "prod", ]: diff --git a/pandas/core/indexers/__init__.py b/pandas/core/indexers/__init__.py index ba8a4f1d0ee7a..036b32b3feac2 100644 --- a/pandas/core/indexers/__init__.py +++ b/pandas/core/indexers/__init__.py @@ -15,17 +15,17 @@ ) __all__ = [ - "is_valid_positional_slice", + "check_array_indexer", + "check_key_length", + "check_setitem_lengths", + "disallow_ndim_indexing", + "is_empty_indexer", "is_list_like_indexer", "is_scalar_indexer", - "is_empty_indexer", - "check_setitem_lengths", - "validate_indices", - "maybe_convert_indices", + "is_valid_positional_slice", "length_of_indexer", - "disallow_ndim_indexing", + "maybe_convert_indices", "unpack_1tuple", - "check_key_length", - "check_array_indexer", "unpack_tuple_and_ellipses", + "validate_indices", ] diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index e2dc71f68a65b..c404323a1168c 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -373,6 +373,28 @@ def to_pydatetime(self) -> Series: @property def freq(self): + """ + Tries to return a string representing a frequency generated by infer_freq. + + Returns None if it can't autodetect the frequency. + + See Also + -------- + Series.dt.to_period : Cast to PeriodArray/PeriodIndex at a particular + frequency. 
+ + Examples + -------- + >>> ser = pd.Series(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"]) + >>> ser = pd.to_datetime(ser) + >>> ser.dt.freq + 'D' + + >>> ser = pd.Series(["2022-01-01", "2024-01-01", "2026-01-01", "2028-01-01"]) + >>> ser = pd.to_datetime(ser) + >>> ser.dt.freq + '2YS-JAN' + """ return self._get_values().inferred_freq def isocalendar(self) -> DataFrame: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 5144e647e73b4..058e584336905 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -37,26 +37,26 @@ __all__ = [ - "Index", - "MultiIndex", "CategoricalIndex", + "DatetimeIndex", + "Index", "IntervalIndex", - "RangeIndex", "InvalidIndexError", - "TimedeltaIndex", + "MultiIndex", + "NaT", "PeriodIndex", - "DatetimeIndex", + "RangeIndex", + "TimedeltaIndex", "_new_Index", - "NaT", + "all_indexes_same", + "default_index", "ensure_index", "ensure_index_from_sequences", "get_objs_combined_axis", - "union_indexes", "get_unanimous_names", - "all_indexes_same", - "default_index", - "safe_sort_index", "maybe_sequence_to_range", + "safe_sort_index", + "union_indexes", ] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 749a5fea4d513..e2f9c5e9868a9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -71,6 +71,7 @@ Appender, cache_readonly, doc, + set_module, ) from pandas.util._exceptions import ( find_stack_level, @@ -315,6 +316,7 @@ def _new_Index(cls, d): return cls.__new__(cls, **d) +@set_module("pandas") class Index(IndexOpsMixin, PandasObject): """ Immutable sequence used for indexing and alignment. @@ -874,7 +876,7 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringEngine(target_values) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -908,7 +910,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): @@ -5135,7 +5141,9 @@ def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ - return is_object_dtype(self.dtype) + return is_object_dtype(self.dtype) or ( + is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr] + ) def __contains__(self, key: Any) -> bool: """ @@ -5340,7 +5348,7 @@ def putmask(self, mask, value) -> Index: See Also -------- - numpy.ndarray.putmask : Changes elements of an array + numpy.putmask : Changes elements of an array based on conditional and input values. 
Examples @@ -5966,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): @@ -6548,7 +6555,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 312219eb7b91a..d20a84449fb85 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -13,6 +13,7 @@ from pandas.util._decorators import ( cache_readonly, doc, + set_module, ) from pandas.core.dtypes.common import is_scalar @@ -76,6 +77,7 @@ Categorical, wrap=True, ) +@set_module("pandas") class CategoricalIndex(NDArrayBackedExtensionIndex): """ Index based on an underlying :class:`Categorical`. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 536f22d38468d..9adbaadbdcdc8 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -26,6 +26,7 @@ from pandas.util._decorators import ( cache_readonly, doc, + set_module, ) from pandas.core.dtypes.common import is_scalar @@ -126,6 +127,7 @@ def _new_DatetimeIndex(cls, d): + DatetimeArray._bool_ops, DatetimeArray, ) +@set_module("pandas") class DatetimeIndex(DatetimeTimedeltaMixin): """ Immutable ndarray-like of datetime64 data. 
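The `Index.__array__` hunk a few hunks above forwards NumPy's `copy` keyword, and the `MultiIndex.__array__` hunk further below raises when `copy=False` cannot be honored. A minimal sketch, relying only on public NumPy/pandas behavior, of the three copy modes that branch distinguishes:

import numpy as np
import pandas as pd

idx = pd.Index([1, 2, 3])

# copy=None / plain asarray: sharing the underlying data is allowed; this is
# the NumPy 1.x-compatible branch kept in __array__ above.
shared = np.asarray(idx)

# copy=True: an independent array must be returned.
owned = np.array(idx, copy=True)
owned[0] = 99
assert idx[0] == 1  # the Index is untouched

# copy=False (NumPy 2) means "never copy"; objects that can only materialize
# their values by building a new array (e.g. MultiIndex.values, see the hunk
# below) are expected to raise ValueError instead of copying silently.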
@@ -814,6 +816,7 @@ def indexer_between_time( return mask.nonzero()[0] +@set_module("pandas") def date_range( start=None, end=None, @@ -1018,6 +1021,7 @@ def date_range( return DatetimeIndex._simple_new(dtarr, name=name) +@set_module("pandas") def bdate_range( start=None, end=None, diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index c559c529586b5..254bd71ade209 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -110,7 +110,9 @@ def _disabled(self, *args, **kwargs) -> NoReturn: raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") def __str__(self) -> str: - return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) + return pprint_thing( + self, quote_strings=True, escape_chars=("\t", "\r", "\n", "'") + ) def __repr__(self) -> str: return f"{type(self).__name__}({self!s})" diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 94717141b30b0..13811c28e6c1e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -32,6 +32,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + set_module, ) from pandas.util._exceptions import rewrite_exception @@ -202,6 +203,7 @@ def _new_IntervalIndex(cls, d): IntervalArray, ) @inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) +@set_module("pandas") class IntervalIndex(ExtensionIndex): _typ = "intervalindex" @@ -556,8 +558,7 @@ def _maybe_convert_i8(self, key): left = self._maybe_convert_i8(key.left) right = self._maybe_convert_i8(key.right) constructor = Interval if scalar else IntervalIndex.from_arrays - # error: "object" not callable - return constructor(left, right, closed=self.closed) # type: ignore[operator] + return constructor(left, right, closed=self.closed) if scalar: # Timestamp/Timedelta diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae9b272af9fe9..dc48cd1ed958e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -53,6 +53,7 @@ Appender, cache_readonly, doc, + set_module, ) from pandas.util._exceptions import find_stack_level @@ -66,6 +67,7 @@ is_list_like, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -194,6 +196,7 @@ def new_meth(self_or_cls, *args, **kwargs): return cast(F, new_meth) +@set_module("pandas") class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects. @@ -1391,6 +1394,15 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is False: + # self.values is always a newly construct array, so raise. + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: @@ -1416,10 +1428,12 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" - def f(level) -> bool: - return "mixed" in level or "string" in level or "unicode" in level + def f(dtype) -> bool: + return is_object_dtype(dtype) or ( + is_string_dtype(dtype) and dtype.storage == "python" + ) - return any(f(level.inferred_type) for level in self.levels) + return any(f(level.dtype) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] @@ -4070,11 +4084,10 @@ def insert(self, loc: int, item) -> MultiIndex: # have to insert into level # must insert at end otherwise you have to recompute all the # other codes - if isna(k): # GH 59003 + lev_loc = len(level) + level = level.insert(lev_loc, k) + if isna(level[lev_loc]): # GH 59003, 60388 lev_loc = -1 - else: - lev_loc = len(level) - level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 377406e24b1d3..0a7a0319bed3a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -20,6 +20,7 @@ from pandas.util._decorators import ( cache_readonly, doc, + set_module, ) from pandas.core.dtypes.common import is_integer @@ -81,6 +82,7 @@ def _new_PeriodIndex(cls, **d): wrap=True, ) @inherit_names(["is_leap_year"], PeriodArray) +@set_module("pandas") class PeriodIndex(DatetimeIndexOpsMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time. diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index dc96d1c11db74..2db50bbbdfa37 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -27,6 +27,7 @@ from pandas.util._decorators import ( cache_readonly, doc, + set_module, ) from pandas.core.dtypes.base import ExtensionDtype @@ -74,6 +75,7 @@ def min_fitting_element(start: int, step: int, lower_limit: int) -> int: return start + abs(step) * no_steps +@set_module("pandas") class RangeIndex(Index): """ Immutable Index implementing a monotonic integer range. @@ -188,10 +190,31 @@ def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self: """ Create :class:`pandas.RangeIndex` from a ``range`` object. + This method provides a way to create a :class:`pandas.RangeIndex` directly + from a Python ``range`` object. The resulting :class:`RangeIndex` will have + the same start, stop, and step values as the input ``range`` object. + It is particularly useful for constructing indices in an efficient and + memory-friendly manner. + + Parameters + ---------- + data : range + The range object to be converted into a RangeIndex. + name : str, default None + Name to be stored in the index. + dtype : Dtype or None + Data type for the RangeIndex. If None, the default integer type will + be used. + Returns ------- RangeIndex + See Also + -------- + RangeIndex : Immutable Index implementing a monotonic integer range. + Index : Immutable sequence used for indexing and alignment. 
+ Examples -------- >>> pd.RangeIndex.from_range(range(5)) @@ -1193,7 +1216,7 @@ def _getitem_slice(self, slobj: slice) -> Self: @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): if is_integer(other) and other != 0: - if len(self) == 0 or self.start % other == 0 and self.step % other == 0: + if len(self) == 0 or (self.start % other == 0 and self.step % other == 0): start = self.start // other step = self.step // other stop = start + len(self) * step diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 29039ffd0217e..fa3de46621643 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -13,6 +13,7 @@ Timedelta, to_offset, ) +from pandas.util._decorators import set_module from pandas.core.dtypes.common import ( is_scalar, @@ -50,6 +51,7 @@ ], TimedeltaArray, ) +@set_module("pandas") class TimedeltaIndex(DatetimeTimedeltaMixin): """ Immutable Index of timedelta64 data. @@ -235,6 +237,7 @@ def inferred_type(self) -> str: return "timedelta64" +@set_module("pandas") def timedelta_range( start=None, end=None, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 08bd3cde60806..656ee54cbc5d4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -160,7 +160,7 @@ def iloc(self) -> _iLocIndexer: .. versionchanged:: 3.0 - Returning a tuple from a callable is deprecated. + Callables which return a tuple are deprecated as input. ``.iloc[]`` is primarily integer position based (from ``0`` to ``length-1`` of the axis), but may also be used with a boolean @@ -212,7 +212,7 @@ def iloc(self) -> _iLocIndexer: With a scalar integer. >>> type(df.iloc[0]) - + >>> df.iloc[0] a 1 b 2 @@ -914,7 +914,9 @@ def __setitem__(self, key, value) -> None: indexer = self._get_setitem_indexer(key) self._has_valid_setitem_indexer(key) - iloc = self if self.name == "iloc" else self.obj.iloc + iloc: _iLocIndexer = ( + cast("_iLocIndexer", self) if self.name == "iloc" else self.obj.iloc + ) iloc._setitem_with_indexer(indexer, value, self.name) def _validate_key(self, key, axis: AxisInt) -> None: @@ -1237,8 +1239,10 @@ def _validate_key(self, key, axis: Axis) -> None: if isinstance(key, bool) and not ( is_bool_dtype(ax.dtype) or ax.dtype.name == "boolean" - or isinstance(ax, MultiIndex) - and is_bool_dtype(ax.get_level_values(0).dtype) + or ( + isinstance(ax, MultiIndex) + and is_bool_dtype(ax.get_level_values(0).dtype) + ) ): raise KeyError( f"{key}: boolean label can not be used without a boolean index" @@ -2118,7 +2122,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) - is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 + is_null_setter = com.is_empty_slice(pi) or (is_array_like(pi) and len(pi) == 0) if is_null_setter: # no-op, don't cast dtype later @@ -2742,19 +2746,15 @@ def check_dict_or_set_indexers(key) -> None: """ Check if the indexer is or contains a dict or set, which is no longer allowed. """ - if ( - isinstance(key, set) - or isinstance(key, tuple) - and any(isinstance(x, set) for x in key) + if isinstance(key, set) or ( + isinstance(key, tuple) and any(isinstance(x, set) for x in key) ): raise TypeError( "Passing a set as an indexer is not supported. Use a list instead." 
) - if ( - isinstance(key, dict) - or isinstance(key, tuple) - and any(isinstance(x, dict) for x in key) + if isinstance(key, dict) or ( + isinstance(key, tuple) and any(isinstance(x, dict) for x in key) ): raise TypeError( "Passing a dict as an indexer is not supported. Use a list instead." diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 0e5776ae8cdd9..b990eca39b3dd 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -9,6 +9,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -39,7 +41,9 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: .. note:: For new development, we highly recommend using the Arrow C Data Interface - alongside the Arrow PyCapsule Interface instead of the interchange protocol + alongside the Arrow PyCapsule Interface instead of the interchange protocol. + From pandas 3.0 onwards, `from_dataframe` uses the PyCapsule Interface, + only falling back to the interchange protocol if that fails. .. warning:: @@ -88,6 +92,18 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: if isinstance(df, pd.DataFrame): return df + if hasattr(df, "__arrow_c_stream__"): + try: + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + except ImportError: + # fallback to _from_dataframe + pass + else: + try: + return pa.table(df).to_pandas(zero_copy_only=not allow_copy) + except pa.ArrowInvalid as e: + raise RuntimeError(e) from e + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") @@ -147,8 +163,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -347,8 +361,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 45758379e0bd6..202bebde88c2c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -6,8 +6,52 @@ ) __all__ = [ - "make_block", + "Block", "BlockManager", + "ExtensionBlock", "SingleBlockManager", "concatenate_managers", + "make_block", ] + + +def __getattr__(name: str): + # GH#55139 + import warnings + + if name == "create_block_manager_from_blocks": + # GH#33892 + warnings.warn( + f"{name} is deprecated and will be removed in a future version. 
" + "Use public APIs instead.", + FutureWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks + + if name in [ + "Block", + "ExtensionBlock", + ]: + warnings.warn( + f"{name} is deprecated and will be removed in a future version. " + "Use public APIs instead.", + FutureWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + if name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + else: + from pandas.core.internals.blocks import Block + + return Block + + raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a3ff577966a6d..f44ad926dda5c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -77,6 +77,7 @@ ABCNumpyExtensionArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_re from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -107,6 +108,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -512,9 +514,8 @@ def convert(self) -> list[Block]: convert_non_numeric=True, ) refs = None - if ( - res_values is values - or isinstance(res_values, NumpyExtensionArray) + if res_values is values or ( + isinstance(res_values, NumpyExtensionArray) and res_values._ndarray is values ): refs = self.refs @@ -706,7 +707,7 @@ def replace( # bc _can_hold_element is incorrect. return [self.copy(deep=False)] - elif self._can_hold_element(value): + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? blk = self._maybe_copy(inplace) @@ -766,14 +767,24 @@ def _replace_regex( ------- List[Block] """ - if not self._can_hold_element(to_replace): + if not is_re(to_replace) and not self._can_hold_element(to_replace): # i.e. 
only if self.is_object is True, but could in principle include a # String ExtensionBlock return [self.copy(deep=False)] - rx = re.compile(to_replace) + if is_re(to_replace) and self.dtype not in [object, "string"]: + # only object or string dtype can hold strings, and a regex object + # will only match strings + return [self.copy(deep=False)] - block = self._maybe_copy(inplace) + if not ( + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) + ): + block = self.astype(np.dtype(object)) + else: + block = self._maybe_copy(inplace) + + rx = re.compile(to_replace) replace_regex(block.values, rx, value, mask) return [block] @@ -793,7 +804,9 @@ def replace_list( # Exclude anything that we know we won't contain pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + (x, y) + for x, y in zip(src_list, dest_list) + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] if not len(pairs): return [self.copy(deep=False)] @@ -1323,7 +1336,7 @@ def fillna( return [self.copy(deep=False)] if limit is not None: - mask[mask.cumsum(self.ndim - 1) > limit] = False + mask[mask.cumsum(self.values.ndim - 1) > limit] = False if inplace: nbs = self.putmask(mask.T, value) @@ -1671,9 +1684,16 @@ def where(self, other, cond) -> list[Block]: res_values = arr._where(cond, other).T except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) + if ( + self.ndim == 2 + and isinstance(orig_cond, np.ndarray) + and orig_cond.ndim == 1 + and not is_1d_only_ea_dtype(blk.dtype) + ): + orig_cond = orig_cond[:, None] return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): @@ -1841,9 +1861,9 @@ def fillna( limit: int | None = None, inplace: bool = False, ) -> list[Block]: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # Block.fillna handles coercion (test_fillna_interval) - if limit is not None: + if isinstance(self.dtype, IntervalDtype) and limit is not None: raise ValueError("limit must be None") return super().fillna( value=value, diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..dfff34656f82b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -258,7 +258,7 @@ def ndarray_to_mgr( # and a subsequent `astype` will not already result in a copy values = np.array(values, copy=True, order="F") else: - values = np.array(values, copy=False) + values = np.asarray(values) values = _ensure_2d(values) else: @@ -417,8 +417,7 @@ def dict_to_mgr( else x.copy(deep=True) if ( isinstance(x, Index) - or isinstance(x, ABCSeries) - and is_1d_only_ea_dtype(x.dtype) + or (isinstance(x, ABCSeries) and is_1d_only_ea_dtype(x.dtype)) ) else x for x in arrays @@ -966,8 +965,9 @@ def convert(arr): if dtype is None: if arr.dtype == np.dtype("O"): # i.e. 
maybe_convert_objects didn't convert - arr = maybe_infer_to_datetimelike(arr) - if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + convert_to_nullable_dtype = dtype_backend != "numpy" + arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) + if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 039d868bccd16..ff2daae002731 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -413,13 +413,10 @@ def func(yvalues: np.ndarray) -> None: **kwargs, ) - # error: Argument 1 to "apply_along_axis" has incompatible type - # "Callable[[ndarray[Any, Any]], None]"; expected "Callable[..., - # Union[_SupportsArray[dtype[]], Sequence[_SupportsArray - # [dtype[]]], Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]]]]]" - np.apply_along_axis(func, axis, data) # type: ignore[arg-type] + # error: No overload variant of "apply_along_axis" matches + # argument types "Callable[[ndarray[Any, Any]], None]", + # "int", "ndarray[Any, Any]" + np.apply_along_axis(func, axis, data) # type: ignore[call-overload] def _index_to_interp_indices(index: Index, method: str) -> np.ndarray: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e775156a6ae2f..d6154e2352c63 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -726,7 +726,9 @@ def nanmean( @bottleneck_switch() -def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None): +def nanmedian( + values: np.ndarray, *, axis: AxisInt | None = None, skipna: bool = True, mask=None +) -> float | np.ndarray: """ Parameters ---------- @@ -738,7 +740,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= Returns ------- - result : float + result : float | ndarray Unless input is a float array, in which case use the same precision as the input array. 
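The `nanmedian` hunks above annotate the return as `float | np.ndarray`. A small illustration with plain NumPy (not the pandas internals) of why both cases arise:

import numpy as np

values = np.array([[1.0, np.nan, 3.0],
                   [4.0, 5.0, 6.0]])

# Reducing along an axis of a 2-D input produces an ndarray of medians...
per_column = np.nanmedian(values, axis=0)  # array([2.5, 5. , 4.5])

# ...while a full reduction produces a scalar float, matching the
# float | np.ndarray annotation added to nanmedian.
overall = float(np.nanmedian(values))  # 4.0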
@@ -758,7 +760,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= # cases we never need to set NaN to the masked values using_nan_sentinel = values.dtype.kind == "f" and mask is None - def get_median(x, _mask=None): + def get_median(x: np.ndarray, _mask=None): if _mask is None: _mask = notna(x) else: @@ -794,6 +796,8 @@ def get_median(x, _mask=None): notempty = values.size + res: float | np.ndarray + # an array from a frame if values.ndim > 1 and axis is not None: # there's a non-empty array to apply over otherwise numpy raises diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 34a0bb1f45e2c..9f9d69a182f72 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -66,15 +66,18 @@ __all__ = [ "ARITHMETIC_BINOPS", "arithmetic_op", - "comparison_op", "comp_method_OBJECT_ARRAY", - "invalid_comparison", + "comparison_op", "fill_binop", + "get_array_op", + "get_op_result_name", + "invalid_comparison", "kleene_and", "kleene_or", "kleene_xor", "logical_op", "make_flex_doc", + "maybe_prepare_scalar_for_op", "radd", "rand_", "rdiv", @@ -88,7 +91,4 @@ "rtruediv", "rxor", "unpack_zerodim_and_defer", - "get_op_result_name", - "maybe_prepare_scalar_for_op", - "get_array_op", ] diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ca4d3fc768efb..1cfc75ea11725 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -31,10 +31,7 @@ Substitution, doc, ) -from pandas.util._exceptions import ( - find_stack_level, - rewrite_warning, -) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -59,7 +56,6 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -167,14 +163,15 @@ def __init__( gpr_index: Index, group_keys: bool = False, selection=None, - include_groups: bool = True, + include_groups: bool = False, ) -> None: + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") self._timegrouper = timegrouper self.keys = None self.sort = True self.group_keys = group_keys self.as_index = True - self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -381,10 +378,20 @@ def transform(self, arg, *args, **kwargs): ---------- arg : function To apply to each group. Should return a Series with the same index. + *args, **kwargs + Additional arguments and keywords. Returns ------- Series + A Series with the transformed values, maintaining the same index as + the original object. + + See Also + -------- + core.resample.Resampler.apply : Apply a function along each group. + core.resample.Resampler.aggregate : Aggregate using one or more operations + over the specified axis. 
Examples -------- @@ -465,9 +472,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -479,21 +484,23 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) return self._wrap_result(result) @final def _get_resampler_for_grouping( - self, groupby: GroupBy, key, include_groups: bool = True + self, + groupby: GroupBy, + key, ): """ Return the correct class for resampling with groupby. """ return self._resampler_for_grouping( - groupby=groupby, key=key, parent=self, include_groups=include_groups + groupby=groupby, + key=key, + parent=self, ) def _wrap_result(self, result): @@ -694,7 +701,7 @@ def bfill(self, limit: int | None = None): References ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29 Examples -------- @@ -935,7 +942,7 @@ def interpolate( "supported. If you tried to resample and interpolate on a " "grouped data frame, please use:\n" "`df.groupby(...).apply(lambda x: x.resample(...)." - "interpolate(...), include_groups=False)`" + "interpolate(...))`" "\ninstead, as resampling and interpolation has to be " "performed for each group independently." ) @@ -1099,6 +1106,13 @@ def prod( Series or DataFrame Computed prod of values within each group. + See Also + -------- + core.resample.Resampler.sum : Compute sum of groups, excluding missing values. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + Examples -------- >>> ser = pd.Series( @@ -1129,9 +1143,30 @@ def min( """ Compute min value of group. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + Returns ------- Series or DataFrame + Compute the minimum value in the given Series or DataFrame. + + See Also + -------- + core.resample.Resampler.max : Compute max value of group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. Examples -------- @@ -1163,9 +1198,30 @@ def max( """ Compute max value of group. + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + Returns ------- Series or DataFrame + Computes the maximum value in the given Series or Dataframe. 
+ + See Also + -------- + core.resample.Resampler.min : Compute min value of group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. Examples -------- @@ -1213,8 +1269,53 @@ def last( ) @final - @doc(GroupBy.median) def median(self, numeric_only: bool = False): + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to False. + + Returns + ------- + Series or DataFrame + Median of values within each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").median() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + """ return self._downsample("median", numeric_only=numeric_only) @final @@ -1239,6 +1340,16 @@ def mean( DataFrame or Series Mean of values within each group. + See Also + -------- + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + core.resample.Resampler.sum : Compute sum of groups, excluding missing values. + core.resample.Resampler.std : Compute standard deviation of groups, excluding + missing values. + core.resample.Resampler.var : Compute variance of groups, excluding missing + values. + Examples -------- @@ -1288,6 +1399,14 @@ def std( DataFrame or Series Standard deviation of values within each group. + See Also + -------- + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + core.resample.Resampler.var : Compute variance of groups, excluding missing + values. + Examples -------- @@ -1339,6 +1458,14 @@ def var( DataFrame or Series Variance of values within each group. + See Also + -------- + core.resample.Resampler.std : Compute standard deviation of groups, excluding + missing values. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + Examples -------- @@ -1368,12 +1495,61 @@ def var( return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.sem) def sem( self, ddof: int = 1, numeric_only: bool = False, ): + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. 
+ Series.sem : Return unbiased standard error of the mean over requested axis. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final @@ -1541,7 +1717,6 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, - include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1564,7 +1739,6 @@ def __init__( self.ax = parent.ax self.obj = parent.obj - self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1581,7 +1755,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = _apply(self._groupby, func, include_groups=self.include_groups) + result = self._groupby.apply(func) return self._wrap_result(result) _upsample = _apply @@ -1937,7 +2111,6 @@ def get_resampler_for_grouping( fill_method=None, limit: int | None = None, on=None, - include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -1946,9 +2119,7 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj) - return resampler._get_resampler_for_grouping( - groupby=groupby, include_groups=include_groups, key=tg.key - ) + return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) class TimeGrouper(Grouper): @@ -2002,9 +2173,7 @@ def __init__( raise ValueError(f"Unsupported value {convention} for `convention`") if ( - key is None - and obj is not None - and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + (key is None and obj is not None and isinstance(obj.index, PeriodIndex)) # type: ignore[attr-defined] or ( key is not None and obj is not None @@ -2729,18 +2898,3 @@ def _asfreq_compat(index: FreqIndexT, freq) -> FreqIndexT: else: # pragma: no cover raise TypeError(type(index)) return new_index - - -def _apply( - grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs -) -> DataFrame: - # GH#7155 - rewrite warning to appear as if it came from `.resample` - target_message = "DataFrameGroupBy.apply operated on the grouping columns" - new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") - with rewrite_warning( - target_message=target_message, - target_category=DeprecationWarning, - new_message=new_message, - ): - result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) - return result diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index cfe83111b6e38..e7cb7069bbc26 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -17,6 +17,7 @@ import numpy as np from pandas._libs import lib +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -149,6 +150,7 @@ def concat( ) -> DataFrame | Series: ... 
+@set_module("pandas") def concat( objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bfd8e3ccd2f7c..f4cb82816bbcf 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,9 +51,9 @@ def melt( """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - This function is useful to massage a DataFrame into a format where one + This function is useful to reshape a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to + columns are considered measured variables (`value_vars`), and are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 07e8fa4841c04..5fddd9f9aca5b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -39,7 +39,10 @@ npt, ) from pandas.errors import MergeError -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import ( + cache_readonly, + set_module, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype @@ -123,11 +126,22 @@ # See https://github.com/pandas-dev/pandas/issues/52451 if np.intc is not np.int32: - _factorizers[np.intc] = libhashtable.Int64Factorizer + if np.dtype(np.intc).itemsize == 4: + _factorizers[np.intc] = libhashtable.Int32Factorizer + else: + _factorizers[np.intc] = libhashtable.Int64Factorizer + +if np.uintc is not np.uint32: + if np.dtype(np.uintc).itemsize == 4: + _factorizers[np.uintc] = libhashtable.UInt32Factorizer + else: + _factorizers[np.uintc] = libhashtable.UInt64Factorizer + _known = (np.ndarray, ExtensionArray, Index, ABCSeries) +@set_module("pandas") def merge( left: DataFrame | Series, right: DataFrame | Series, @@ -492,6 +506,7 @@ def _groupby_and_merge( return result, lby +@set_module("pandas") def merge_ordered( left: DataFrame | Series, right: DataFrame | Series, @@ -635,6 +650,7 @@ def _merger(x, y) -> DataFrame: return result +@set_module("pandas") def merge_asof( left: DataFrame | Series, right: DataFrame | Series, @@ -2730,8 +2746,7 @@ def _factorize_keys( isinstance(lk.dtype, ArrowDtype) and ( is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort + or (is_string_dtype(lk.dtype) and not sort) ) ): lk, _ = lk._values_for_factorize() diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b3f946f289891..034b861a83f43 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -73,7 +73,7 @@ def cut( Parameters ---------- - x : array-like + x : 1d ndarray or Series The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. @@ -126,7 +126,7 @@ def cut( Categorical for all other inputs. The values stored within are whatever the type in the sequence is. - * False : returns an ndarray of integers. + * False : returns a 1d ndarray or Series of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. 
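Several hunks in this patch (Index, CategoricalIndex, DatetimeIndex, date_range, concat, merge, Series, ...) add a `@set_module("pandas")` decorator. The decorator itself is internal; as a rough, hedged sketch of its presumed effect (names and details below are illustrative, not the actual pandas implementation):

def set_module(module):
    # Presumed effect: make a public object report `module` as its home so
    # that reprs, pickling references and docs show e.g. pandas.DatetimeIndex
    # instead of pandas.core.indexes.datetimes.DatetimeIndex.
    def decorator(obj):
        if module is not None:
            obj.__module__ = module
        return obj
    return decorator


@set_module("pandas")
def some_public_function():
    """Stand-in for decorated functions such as concat or date_range."""


assert some_public_function.__module__ == "pandas"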
diff --git a/pandas/core/series.py b/pandas/core/series.py index fe2bb0b5aa5c3..4fa8b86fa4c16 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -50,6 +50,7 @@ Substitution, deprecate_nonkeyword_arguments, doc, + set_module, ) from pandas.util._validators import ( validate_ascending, @@ -229,6 +230,7 @@ # error: Cannot override final attribute "size" (previously declared in base # class "NDFrame") # definition in base class "NDFrame" +@set_module("pandas") class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] """ One-dimensional ndarray with axis labels (including time series). @@ -565,7 +567,7 @@ def __arrow_c_stream__(self, requested_schema=None): Export the pandas Series as an Arrow C stream PyCapsule. This relies on pyarrow to convert the pandas Series to the Arrow - format (and follows the default behaviour of ``pyarrow.Array.from_pandas`` + format (and follows the default behavior of ``pyarrow.Array.from_pandas`` in its handling of the index, i.e. to ignore it). This conversion is not necessarily zero-copy. @@ -842,7 +844,7 @@ def __array__( the dtype is inferred from the data. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -879,8 +881,15 @@ def __array__( dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if copy is False or astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr @@ -2217,7 +2226,7 @@ def drop_duplicates( 5 hippo Name: animal, dtype: object - With the 'keep' parameter, the selection behaviour of duplicated values + With the 'keep' parameter, the selection behavior of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. @@ -2482,6 +2491,7 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: -------- numpy.around : Round values of an np.array. DataFrame.round : Round values of a DataFrame. + Series.dt.round : Round values of data to the specified freq. Notes ----- @@ -3441,7 +3451,7 @@ def sort_values( 4 5.0 dtype: float64 - Sort values ascending order (default behaviour) + Sort values ascending order (default behavior) >>> s.sort_values(ascending=True) 1 1.0 @@ -4088,7 +4098,7 @@ def swaplevel( In the following example, we will swap the levels of the indices. Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. + in a similar manner. Note that column-wise is the default behavior. By not supplying any arguments for i and j, we swap the last and second to last indices. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5ec4de4ee792b..8ebd84226b5e3 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -29,6 +31,7 @@ is_extension_array_dtype, is_integer, is_list_like, + is_numeric_dtype, is_object_dtype, is_re, ) @@ -399,7 +402,9 @@ def cons_row(x): # This is a mess. 
_dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -1374,6 +1379,11 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. + Determines whether each string in the Series or Index starts with a + match to a specified regular expression. This function is especially + useful for validating prefixes, such as ensuring that codes, tags, or + identifiers begin with a specific pattern. + Parameters ---------- pat : str @@ -1419,6 +1429,11 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. + Checks if each string in the Series or Index fully matches the + specified regular expression pattern. This function is useful when the + requirement is for an entire string to conform to a pattern, such as + validating formats like phone numbers or email addresses. + Parameters ---------- pat : str @@ -1647,6 +1662,10 @@ def repeat(self, repeats): """ Duplicate each string in the Series or Index. + Duplicates each string in the Series or Index, either by applying the + same repeat count to all elements or by using different repeat values + for each element. + Parameters ---------- repeats : int or sequence of int @@ -1710,6 +1729,12 @@ def pad( """ Pad strings in the Series/Index up to width. + This function pads strings in a Series or Index to a specified width, + filling the extra space with a character of your choice. It provides + flexibility in positioning the padding, allowing it to be added to the + left, right, or both sides. This is useful for formatting strings to + align text or ensure consistent string lengths in data processing. + Parameters ---------- width : int @@ -1920,6 +1945,11 @@ def slice(self, start=None, stop=None, step=None): """ Slice substrings from each element in the Series or Index. + Slicing substrings from strings in a Series or Index helps extract + specific portions of data, making it easier to analyze or manipulate + text. This is useful for tasks like parsing structured text fields or + isolating parts of strings with a consistent format. + Parameters ---------- start : int, optional @@ -1996,6 +2026,11 @@ def slice_replace(self, start=None, stop=None, repl=None): """ Replace a positional slice of a string with another value. + This function allows replacing specific parts of a string in a Series + or Index by specifying start and stop positions. It is useful for + modifying substrings in a controlled way, such as updating sections of + text based on their positions or patterns. 
+ Parameters ---------- start : int, optional @@ -2110,9 +2145,9 @@ def decode(self, encoding, errors: str = "strict"): decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + dtype = "str" if get_option("future.infer_string") else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): @@ -2494,10 +2529,12 @@ def get_dummies( """ from pandas.core.frame import DataFrame + if dtype is not None and not (is_numeric_dtype(dtype) or is_bool_dtype(dtype)): + raise ValueError("Only numeric or boolean dtypes are supported for 'dtype'") # we need to cast to Series of strings as only that has all # methods available for making the dummies... result, name = self._data.array._str_get_dummies(sep, dtype) - if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + if is_extension_array_dtype(dtype): return self._wrap_result( DataFrame(result, columns=name, dtype=dtype), name=name, @@ -3385,7 +3422,8 @@ def len(self): # cases: # upper, lower, title, capitalize, swapcase, casefold # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower + # isupper istitle isascii # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} @@ -3465,6 +3503,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3488,6 +3527,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3514,6 +3554,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3546,6 +3587,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3571,6 +3613,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. 
Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3597,6 +3640,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3619,6 +3663,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3644,6 +3689,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.istitle : Check whether all characters are titlecase. Examples @@ -3667,10 +3713,11 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Examples - ------------ + -------- The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are assumed to be as any sequence of non-numeric characters separated by @@ -3684,11 +3731,40 @@ def casefold(self): 3 False dtype: bool """ + _shared_docs["isascii"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Series.str.isupper : Check whether all characters are uppercase. + + Examples + ------------ + The ``s5.str.isascii`` method checks for whether all characters are ascii + characters, which includes digits 0-9, capital and lowercase letters A-Z, + and some other special characters. 
+ + >>> s5 = pd.Series(['ö', 'see123', 'hello world', '']) + >>> s5.str.isascii() + 0 False + 1 True + 2 True + 3 True + dtype: bool + """ + _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} _doc_args["islower"] = {"type": "lowercase", "method": "islower"} + _doc_args["isascii"] = {"type": "ascii", "method": "isascii"} _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} @@ -3720,6 +3796,11 @@ def casefold(self): docstring=_shared_docs["ismethods"] % _doc_args["islower"] + _shared_docs["islower"], ) + isascii = _map_and_wrap( + "isascii", + docstring=_shared_docs["ismethods"] % _doc_args["isascii"] + + _shared_docs["isascii"], + ) isupper = _map_and_wrap( "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 4ed36f85167c9..78c4f3acbe1aa 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -179,6 +179,10 @@ def _str_isalnum(self): def _str_isalpha(self): pass + @abc.abstractmethod + def _str_isascii(self): + pass + @abc.abstractmethod def _str_isdecimal(self): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0268194e64d50..0adb7b51cf2b7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -434,7 +434,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): dummies_dtype = _dtype else: dummies_dtype = np.bool_ - dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F") def _isin(test_elements: str, element: str) -> bool: return element in test_elements @@ -455,6 +455,9 @@ def _str_isalnum(self): def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") + def _str_isascii(self): + return self._str_map(str.isascii, dtype="bool") + def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4680a63bf57a1..30487de7bafd5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -44,6 +44,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_float, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -1153,6 +1154,10 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) + # prevent precision issues in case of float32 # GH#60506 + if is_float_dtype(values.dtype): + values = values.astype("float64") + # prevent overflow in case of int8 or int16 if is_integer_dtype(values.dtype): values = values.astype("int64") diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index f159babb7e018..bc45343d6e2d3 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -226,19 +226,18 @@ def to_numeric( set(), coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA, + or ( + isinstance(values_dtype, StringDtype) + and values_dtype.na_value is libmissing.NA + ), ) if new_mask is
not None: # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif ( - dtype_backend is not lib.no_default - and new_mask is None - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA + elif (dtype_backend is not lib.no_default and new_mask is None) or ( + isinstance(values_dtype, StringDtype) and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index cdb670ee218b4..6dbc52a99e70c 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -85,6 +85,63 @@ def create_section_header(header: str) -> str: """ ).replace("\n", "", 1) +template_pipe = """ +Apply a ``func`` with arguments to this %(klass)s object and return its result. + +Use `.pipe` when you want to improve readability by chaining together +functions that expect Series, DataFrames, GroupBy, Rolling, Expanding or Resampler +objects. +Instead of writing + +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, index=pd.date_range('2012-08-02', periods=4)) +>>> h(g(f(df.rolling('2D')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + +You can write + +>>> (df.rolling('2D') +... .pipe(f) +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP + +which is much more readable. + +Parameters +---------- +func : callable or tuple of (callable, str) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +*args : iterable, optional + Positional arguments passed into `func`. +**kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +%(klass)s + The original object with the function `func` applied. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe: Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object. 
+ +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + numba_notes = ( "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " "extended documentation and performance considerations for the Numba engine.\n\n" diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 43a3c03b6cef9..73e4de6ea6208 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -490,7 +490,7 @@ def online( klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -981,7 +981,7 @@ def reset(self) -> None: """ self._mean.reset() - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): raise NotImplementedError("aggregate is not implemented.") def std(self, bias: bool = False, *args, **kwargs): diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 4bf77b3d38689..81c89e1ef5428 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -5,9 +5,15 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.indexers.objects import ( BaseIndexer, @@ -20,6 +26,7 @@ kwargs_numeric_only, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -34,7 +41,11 @@ from collections.abc import Callable from pandas._typing import ( + Concatenate, + P, QuantileInterpolation, + Self, + T, WindowingRankType, ) @@ -167,7 +178,7 @@ def _get_window_indexer(self) -> BaseIndexer: klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -241,6 +252,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Expanding", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each expanding window's maximum and minimum + value in one pass, you can do + + >>> df.expanding().pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 2.0 + 2012-08-05 3.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), @@ -664,6 +723,78 @@ def skew(self, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. 
+ Expanding.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 0.0 + 4 0.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Expanding.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b1c37ab48fa57..b954ce2584c13 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,6 +14,8 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) import numpy as np @@ -26,7 +28,11 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.dtypes.common import ( ensure_float64, @@ -44,7 +50,10 @@ from pandas.core._numba import executor from pandas.core.algorithms import factorize -from pandas.core.apply import ResamplerWindowApply +from pandas.core.apply import ( + ResamplerWindowApply, + reconstruct_func, +) from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -78,6 +87,7 @@ kwargs_scipy, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -99,8 +109,12 @@ from pandas._typing import ( ArrayLike, + Concatenate, NDFrameT, QuantileInterpolation, + P, + Self, + T, WindowingRankType, npt, ) @@ -269,7 +283,7 @@ def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: """ # filter out the on from the object if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: - obj = obj.reindex(columns=obj.columns.difference([self.on])) + obj = obj.reindex(columns=obj.columns.difference([self.on], sort=False)) if obj.ndim > 1 and numeric_only: obj = self._make_numeric_only(obj) return obj @@ -646,8 +660,12 @@ def _numba_apply( out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): + relabeling, func, columns, order = reconstruct_func(func, 
**kwargs) result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if isinstance(result, ABCDataFrame) and relabeling: + result = result.iloc[:, order] + result.columns = columns # type: ignore[union-attr] if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -911,14 +929,21 @@ class Window(BaseWindow): an integer index is not used to calculate the rolling window. closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + Determines the inclusivity of points in the window. + If ``'right'``, (First, Last] the last point in the window + is included in the calculations. + + If ``'left'``, [First, Last) the first point in the window + is included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'both'``, [First, Last] all points in the window + are included in the calculations. - If ``'both'``, no point in the window is excluded from calculations. + If ``'neither'``, (First, Last) the first and last points + in the window are excluded from calculations. - If ``'neither'``, the first and last points in the window are excluded - from calculations. + () and [] refer to open and closed set + notation respectively. Default ``None`` (``'right'``). @@ -1239,7 +1264,7 @@ def calc(x): klass="Series/DataFrame", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly @@ -1522,6 +1547,30 @@ def apply_func(values, begin, end, min_periods, raw=raw): return apply_func + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + def sum( self, numeric_only: bool = False, @@ -1698,6 +1747,22 @@ def kurt(self, numeric_only: bool = False): numeric_only=numeric_only, ) + def first(self, numeric_only: bool = False): + window_func = window_aggregations.roll_first + return self._apply( + window_func, + name="first", + numeric_only=numeric_only, + ) + + def last(self, numeric_only: bool = False): + window_func = window_aggregations.roll_last + return self._apply( + window_func, + name="last", + numeric_only=numeric_only, + ) + def quantile( self, q: float, @@ -1951,7 +2016,7 @@ def _raise_monotonic_error(self, msg: str): klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -2037,6 +2102,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Rolling", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ...
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each rolling 2-day window's maximum and minimum + value in one pass, you can do + + >>> df.rolling('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 1.0 + 2012-08-05 1.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), @@ -2532,6 +2645,78 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. + Rolling.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.rolling(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 1.0 + 4 2.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Rolling.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.rolling(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 0aaee1ec177ee..2b5bc450e41d6 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -20,6 +20,16 @@ class IntCastingNaNError(ValueError): """ Exception raised when converting (``astype``) an array with NaN to an integer type. + This error occurs when attempting to cast a data structure containing non-finite + values (such as NaN or infinity) to an integer data type. Integer types do not + support non-finite values, so such conversions are explicitly disallowed to + prevent silent data corruption or unexpected behavior. + + See Also + -------- + DataFrame.astype : Method to cast a pandas DataFrame object to a specified dtype. + Series.astype : Method to cast a pandas Series object to a specified dtype. 
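As a quick illustration of the ``Rolling.first``/``Rolling.last`` aggregations introduced in the rolling.py and expanding.py hunks above, here is a minimal doctest-style sketch; the combined expression is illustrative and not part of the patch, and the expected values follow the docstrings added in this diff:

>>> s = pd.Series(range(5))
>>> s.rolling(3).last() - s.rolling(3).first()
0    NaN
1    NaN
2    2.0
3    2.0
4    2.0
dtype: float64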
+ Examples -------- >>> pd.DataFrame(np.array([[1, np.nan], [2, 3]]), dtype="i8") @@ -35,6 +45,11 @@ class NullFrequencyError(ValueError): Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``, ``PeriodIndex.shift``. + See Also + -------- + Index.shift : Shift values of Index. + Series.shift : Shift values of Series. + Examples -------- >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) @@ -48,6 +63,12 @@ class PerformanceWarning(Warning): """ Warning raised when there is a possible performance impact. + See Also + -------- + DataFrame.set_index : Set the DataFrame index using existing columns. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- >>> df = pd.DataFrame( @@ -100,6 +121,11 @@ class UnsortedIndexError(KeyError): Subclass of `KeyError`. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by its index. + DataFrame.set_index : Set the DataFrame index using existing columns. + Examples -------- >>> df = pd.DataFrame( @@ -370,6 +396,13 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame( @@ -388,6 +421,19 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. + This error is typically encountered when performing operations on objects + with `allows_duplicate_labels=False` and the operation would result in + duplicate labels in the index. Duplicate labels can lead to ambiguities + in indexing and reduce data integrity. + + See Also + -------- + Series.set_flags : Return a new ``Series`` object with updated flags. + DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags. + Series.reindex : Conform ``Series`` object to new index with optional filling logic. + DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling + logic. + Examples -------- >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( @@ -407,6 +453,16 @@ class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. + This exception is triggered when a user attempts to access or manipulate + data in a pandas DataFrame or Series using an index key that is not valid + for the given object. This may occur in cases such as using a malformed + slice, a mismatched key for a ``MultiIndex``, or attempting to access an index + element that does not exist. + + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) @@ -429,6 +485,11 @@ class DataError(Exception): For example, calling ``ohlc`` on a non-numerical column or a function on a rolling window. + See Also + -------- + Series.rolling : Provide rolling window calculations on Series object. + DataFrame.rolling : Provide rolling window calculations on DataFrame object. + Examples -------- >>> ser = pd.Series(["a", "b", "c"]) @@ -482,6 +543,11 @@ class ChainedAssignmentError(Warning): For more information on Copy-on-Write, see :ref:`the user guide`. 
+ See Also + -------- + options.mode.copy_on_write : Global setting for enabling or disabling + Copy-on-Write behavior. + Examples -------- >>> pd.options.mode.copy_on_write = True @@ -500,6 +566,11 @@ class NumExprClobberingError(NameError): to 'numexpr'. 'numexpr' is the default engine value for these methods if the numexpr package is installed. + See Also + -------- + eval : Evaluate a Python expression as a string using various backends. + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + Examples -------- >>> df = pd.DataFrame({"abs": [1, 1, 1]}) @@ -517,6 +588,20 @@ class UndefinedVariableError(NameError): It will also specify whether the undefined variable is local or not. + Parameters + ---------- + name : str + The name of the undefined variable. + is_local : bool or None, optional + Indicates whether the undefined variable is considered a local variable. + If ``True``, the error message specifies it as a local variable. + If ``False`` or ``None``, the variable is treated as a non-local name. + + See Also + -------- + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + DataFrame.eval : Evaluate a string describing operations on DataFrame columns. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1]}) @@ -623,6 +708,15 @@ class PossibleDataLossError(Exception): """ Exception raised when trying to open a HDFStore file when already opened. + This error is triggered when there is a potential risk of data loss due to + conflicting operations on an HDFStore file. It serves to prevent unintended + overwrites or data corruption by enforcing exclusive access to the file. + + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + HDFStore.open : Open an HDFStore file in the specified mode. + Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP @@ -667,6 +761,12 @@ class AttributeConflictWarning(Warning): name than the existing index on an HDFStore or attempting to append an index with a different frequency than the existing index on an HDFStore. + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + DataFrame.to_hdf : Write the contained data to an HDF5 file using HDFStore. + read_hdf : Read from an HDF5 file into a DataFrame. + Examples -------- >>> idx1 = pd.Index(["a", "b"], name="name1") @@ -720,6 +820,16 @@ class ValueLabelTypeMismatch(Warning): """ Warning raised by to_stata on a category column that contains non-string values. + When exporting data to Stata format using the `to_stata` method, category columns + must have string values as labels. If a category column contains non-string values + (e.g., integers, floats, or other types), this warning is raised to indicate that + the Stata file may not correctly represent the data. + + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Series.cat : Accessor for categorical properties of the Series values. 
+ Examples -------- >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")}) @@ -797,28 +907,28 @@ class InvalidComparison(Exception): __all__ = [ "AbstractMethodError", "AttributeConflictWarning", + "CSSWarning", "CategoricalConversionWarning", "ChainedAssignmentError", "ClosedFileError", - "CSSWarning", - "DatabaseError", "DataError", + "DatabaseError", "DtypeWarning", "DuplicateLabelError", "EmptyDataError", "IncompatibilityWarning", + "IndexingError", "IntCastingNaNError", "InvalidColumnName", "InvalidComparison", "InvalidIndexError", "InvalidVersion", - "IndexingError", "LossySetitemError", "MergeError", "NoBufferPresent", "NullFrequencyError", - "NumbaUtilError", "NumExprClobberingError", + "NumbaUtilError", "OptionError", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py index c804b81c49e7c..1c7e531debb14 100644 --- a/pandas/io/__init__.py +++ b/pandas/io/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/_util.py b/pandas/io/_util.py index a1c3318f04466..6827fbe9c998e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,9 +1,19 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np +from pandas._config import using_string_dtype + +from pandas._libs import lib +from pandas.compat import ( + pa_version_under18p0, + pa_version_under19p0, +) from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -11,6 +21,10 @@ if TYPE_CHECKING: from collections.abc import Callable + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -32,10 +46,48 @@ def _arrow_dtype_mapping() -> dict: } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { + mapping = { pa.string(): pd.StringDtype(na_value=np.nan), pa.large_string(): pd.StringDtype(na_value=np.nan), - }.get + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + pa = import_optional_dependency("pyarrow") + + to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + if pa_version_under19p0: + types_mapper = _arrow_string_types_mapper() + else: + types_mapper = None + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/common.py b/pandas/io/common.py index a76f0cf6dd34d..e0076eb486976 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -584,6 +584,9 @@ def 
infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings + if isinstance(filepath_or_buffer, str) and "::" in filepath_or_buffer: + # chained URLs contain :: + filepath_or_buffer = filepath_or_buffer.split("::")[0] filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression @@ -910,10 +913,10 @@ def get_handle( or not hasattr(handle, "seekable") ): handle = _IOWrapper(handle) - # error: Argument 1 to "TextIOWrapper" has incompatible type - # "_IOWrapper"; expected "IO[bytes]" + # error: Value of type variable "_BufferT_co" of "TextIOWrapper" cannot + # be "_IOWrapper | BaseBuffer" [type-var] handle = TextIOWrapper( - handle, # type: ignore[arg-type] + handle, # type: ignore[type-var] encoding=ioargs.encoding, errors=errors, newline="", diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 275cbf0148f94..f13d7afa63d84 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -8,7 +8,7 @@ from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter -__all__ = ["read_excel", "ExcelWriter", "ExcelFile"] +__all__ = ["ExcelFile", "ExcelWriter", "read_excel"] register_writer(_OpenpyxlWriter) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ef52107c283e9..ced2ad91dba1e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -8,6 +8,7 @@ Sequence, ) import datetime +from decimal import Decimal from functools import partial import os from textwrap import fill @@ -43,6 +44,7 @@ from pandas.core.dtypes.common import ( is_bool, + is_decimal, is_file_like, is_float, is_integer, @@ -1348,6 +1350,8 @@ def _value_with_fmt( val = float(val) elif is_bool(val): val = bool(val) + elif is_decimal(val): + val = Decimal(val) elif isinstance(val, datetime.datetime): fmt = self._datetime_format elif isinstance(val, datetime.date): diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index aaae9857b4fae..565c53f0f3fc5 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -15,11 +15,10 @@ from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -79,6 +78,14 @@ def read_feather( """ Load a feather-format object from the file path. + Feather is particularly useful for scenarios that require efficient + serialization and deserialization of tabular data. It supports + schema preservation, making it a reliable choice for use cases + such as sharing data between Python and R, or persisting intermediate + results during data processing pipelines. This method provides additional + flexibility with options for selective column reading, thread parallelism, + and choosing the backend for data types. 
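For example, the backend choice described above can be exercised like this (a minimal sketch that goes through the consolidated ``arrow_table_to_pandas`` helper introduced earlier in this diff; the file name is a placeholder and pyarrow is assumed to be installed):

>>> df = pd.read_feather("data.feather", dtype_backend="pyarrow")  # doctest: +SKIP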
+ Parameters ---------- path : str, path object, or file-like object @@ -147,16 +154,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/formats/__init__.py b/pandas/io/formats/__init__.py index 5e56b1bc7ba43..895669c342f97 100644 --- a/pandas/io/formats/__init__.py +++ b/pandas/io/formats/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 52b5755558900..5fde6577e9f95 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -37,6 +37,7 @@ DataFrame, Index, MultiIndex, + Period, PeriodIndex, ) import pandas.core.common as com @@ -48,7 +49,6 @@ CSSWarning, ) from pandas.io.formats.format import get_level_lengths -from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: from pandas._typing import ( @@ -620,9 +620,8 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: return columns = self.columns - level_strs = columns._format_multi( - sparsify=self.merge_cells in {True, "columns"}, include_names=False - ) + merge_columns = self.merge_cells in {True, "columns"} + level_strs = columns._format_multi(sparsify=merge_columns, include_names=False) level_lengths = get_level_lengths(level_strs) coloffset = 0 lnum = 0 @@ -630,51 +629,34 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: if self.index and isinstance(self.df.index, MultiIndex): coloffset = self.df.index.nlevels - 1 - if self.merge_cells in {True, "columns"}: - # Format multi-index as a merged cells. - for lnum, name in enumerate(columns.names): - yield ExcelCell( - row=lnum, - col=coloffset, - val=name, - style=None, - ) + for lnum, name in enumerate(columns.names): + yield ExcelCell( + row=lnum, + col=coloffset, + val=name, + style=None, + ) - for lnum, (spans, levels, level_codes) in enumerate( - zip(level_lengths, columns.levels, columns.codes) - ): - values = levels.take(level_codes) - for i, span_val in spans.items(): - mergestart, mergeend = None, None - if span_val > 1: - mergestart, mergeend = lnum, coloffset + i + span_val - yield CssExcelCell( - row=lnum, - col=coloffset + i + 1, - val=values[i], - style=None, - css_styles=getattr(self.styler, "ctx_columns", None), - css_row=lnum, - css_col=i, - css_converter=self.style_converter, - mergestart=mergestart, - mergeend=mergeend, - ) - else: - # Format in legacy format with dots to indicate levels. 
- for i, values in enumerate(zip(*level_strs)): - v = ".".join(map(pprint_thing, values)) + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): + values = levels.take(level_codes) + for i, span_val in spans.items(): + mergestart, mergeend = None, None + if merge_columns and span_val > 1: + mergestart, mergeend = lnum, coloffset + i + span_val yield CssExcelCell( row=lnum, col=coloffset + i + 1, - val=v, + val=values[i], style=None, css_styles=getattr(self.styler, "ctx_columns", None), css_row=lnum, css_col=i, css_converter=self.style_converter, + mergestart=mergestart, + mergeend=mergeend, ) - self.rowcounter = lnum def _format_header_regular(self) -> Iterable[ExcelCell]: @@ -798,11 +780,8 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # MultiIndex columns require an extra row # with index names (blank if None) for - # unambiguous round-trip, unless not merging, - # in which case the names all go on one row Issue #11328 - if isinstance(self.columns, MultiIndex) and ( - self.merge_cells in {True, "columns"} - ): + # unambiguous round-trip, Issue #11328 + if isinstance(self.columns, MultiIndex): self.rowcounter += 1 # if index labels are not empty go ahead and dump @@ -825,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: allow_fill=levels._can_hold_na, fill_value=levels._na_value, ) + # GH#60099 + if isinstance(values[0], Period): + values = values.to_timestamp() for i, span_val in spans.items(): mergestart, mergeend = None, None @@ -849,6 +831,10 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # Format hierarchical rows with non-merged values. for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): + # GH#60099 + if isinstance(indexcolval, Period): + indexcolval = indexcolval.to_timestamp() + yield CssExcelCell( row=self.rowcounter + idx, col=gcolidx, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 861f5885f80c6..46ecb2b9a8f12 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -78,7 +78,6 @@ ) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.reshape.concat import concat from pandas.io.common import ( check_parent_directory, @@ -245,7 +244,11 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + _len = len(series) + _slice = np.hstack( + [np.arange(row_num), np.arange(_len - row_num, _len)] + ) + series = series.iloc[_slice] self.tr_row_num = row_num else: self.tr_row_num = None @@ -669,9 +672,9 @@ def _truncate_horizontally(self) -> None: assert self.max_cols_fitted is not None col_num = self.max_cols_fitted // 2 if col_num >= 1: - left = self.tr_frame.iloc[:, :col_num] - right = self.tr_frame.iloc[:, -col_num:] - self.tr_frame = concat((left, right), axis=1) + _len = len(self.tr_frame.columns) + _slice = np.hstack([np.arange(col_num), np.arange(_len - col_num, _len)]) + self.tr_frame = self.tr_frame.iloc[:, _slice] # truncate formatter if isinstance(self.formatters, (list, tuple)): @@ -682,7 +685,7 @@ def _truncate_horizontally(self) -> None: else: col_num = cast(int, self.max_cols) self.tr_frame = self.tr_frame.iloc[:, :col_num] - self.tr_col_num = col_num + self.tr_col_num: int = col_num def _truncate_vertically(self) -> None: """Remove rows, which are not to be displayed. 
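To illustrate the Period handling added to the Excel formatter above (GH#60099), a minimal sketch, assuming openpyxl is installed and using an illustrative file name; Period levels of a MultiIndex are written out as timestamps:

>>> idx = pd.MultiIndex.from_arrays(
...     [pd.period_range("2024-01", periods=3, freq="M"), ["a", "b", "c"]]
... )
>>> df = pd.DataFrame({"sales": [10, 20, 30]}, index=idx)
>>> df.to_excel("sales.xlsx")  # doctest: +SKIP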
@@ -1749,7 +1752,7 @@ def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[st # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] # Therefore, the imaginary part is the 4th and 3rd last elements, # and the real part is everything before the imaginary part - trimmed = re.split(r"([j+-])", x) + trimmed = re.split(r"(? None: use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: _classes.append("tex2jax_ignore") + _classes.append("mathjax_ignore") if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 469dcfb76ba0b..b4c6ff8792d52 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -165,7 +165,7 @@ >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> s = pd.Series(text_values, index=int_values) >>> s.info() - + Index: 5 entries, 1 to 5 Series name: None Non-Null Count Dtype @@ -177,7 +177,7 @@ Prints a summary excluding information about its values: >>> s.info(verbose=False) - + Index: 5 entries, 1 to 5 dtypes: object(1) memory usage: 80.0+ bytes @@ -200,7 +200,7 @@ >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) >>> s.info() - + RangeIndex: 1000000 entries, 0 to 999999 Series name: None Non-Null Count Dtype @@ -210,7 +210,7 @@ memory usage: 7.6+ MB >>> s.info(memory_usage='deep') - + RangeIndex: 1000000 entries, 0 to 999999 Series name: None Non-Null Count Dtype diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 67b5eb6f5ee5b..a9936ba8c8f2c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -203,7 +203,7 @@ def pprint_thing( def as_escaped_string( thing: Any, escape_chars: EscapeChars | None = escape_chars ) -> str: - translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r", "'": r"\'"} if isinstance(escape_chars, Mapping): if default_escapes: translate.update(escape_chars) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index eb6773310da69..3f37556867954 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1228,6 +1228,111 @@ def to_latex( ) return save_to_buffer(latex, buf=buf, encoding=encoding) + @overload + def to_typst( + self, + buf: FilePath | WriteBuffer[str], + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> None: ... + + @overload + def to_typst( + self, + buf: None = ..., + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> str: ... + + @Substitution(buf=buffering_args, encoding=encoding_args) + def to_typst( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + encoding: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + max_rows: int | None = None, + max_columns: int | None = None, + ) -> str | None: + """ + Write Styler to a file, buffer or string in Typst format. + + .. versionadded:: 3.0.0 + + Parameters + ---------- + %(buf)s + %(encoding)s + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. 
+ Defaults to ``pandas.options.styler.sparse.index`` value. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + max_rows : int, optional + The maximum number of rows that will be rendered. Defaults to + ``pandas.options.styler.render.max_rows``, which is None. + max_columns : int, optional + The maximum number of columns that will be rendered. Defaults to + ``pandas.options.styler.render.max_columns``, which is None. + + Rows and columns may be reduced if the number of total elements is + large. This value is set to ``pandas.options.styler.render.max_elements``, + which is 262144 (18 bit browser rendering). + + Returns + ------- + str or None + If `buf` is None, returns the result as a string. Otherwise returns `None`. + + See Also + -------- + DataFrame.to_typst : Write a DataFrame to a file, + buffer or string in Typst format. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df.style.to_typst() # doctest: +SKIP + + .. code-block:: typst + + #table( + columns: 3, + [], [A], [B], + + [0], [1], [3], + [1], [2], [4], + ) + """ + obj = self._copy(deepcopy=True) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + + text = obj._render_typst( + sparse_columns=sparse_columns, + sparse_index=sparse_index, + max_rows=max_rows, + max_cols=max_columns, + ) + return save_to_buffer( + text, buf=buf, encoding=(encoding if buf is not None else None) + ) + @overload def to_html( self, @@ -1644,7 +1749,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: for j in attrs.columns: ser = attrs[j] for i, c in ser.items(): - if not c: + if not c or pd.isna(c): continue css_list = maybe_convert_css_to_tuples(c) if axis == 0: diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 08d9fd938c873..2d1218b007d19 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -77,6 +77,7 @@ class StylerRenderer: template_html_table = env.get_template("html_table.tpl") template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") + template_typst = env.get_template("typst.tpl") template_string = env.get_template("string.tpl") def __init__( @@ -232,6 +233,21 @@ def _render_latex( d.update(kwargs) return self.template_latex.render(**d) + def _render_typst( + self, + sparse_index: bool, + sparse_columns: bool, + max_rows: int | None = None, + max_cols: int | None = None, + **kwargs, + ) -> str: + """ + Render a Styler in typst format + """ + d = self._render(sparse_index, sparse_columns, max_rows, max_cols) + d.update(kwargs) + return self.template_typst.render(**d) + def _render_string( self, sparse_index: bool, @@ -366,9 +382,11 @@ def _translate( if not get_option("styler.html.mathjax"): table_attr = table_attr or "" if 'class="' in table_attr: - table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ') + table_attr = table_attr.replace( + 'class="', 'class="tex2jax_ignore mathjax_ignore ' + ) else: - table_attr += ' class="tex2jax_ignore"' + table_attr += ' class="tex2jax_ignore mathjax_ignore"' d.update({"table_attributes": table_attr}) if self.tooltips: @@ -866,7 +884,8 @@ def _translate_latex(self, d: dict, clines: str | None) -> None: or 
multirow sparsification (so that \multirow and \multicol work correctly). """ index_levels = self.index.nlevels - visible_index_level_n = index_levels - sum(self.hide_index_) + # GH 52218 + visible_index_level_n = max(1, index_levels - sum(self.hide_index_)) d["head"] = [ [ {**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]} diff --git a/pandas/io/formats/templates/typst.tpl b/pandas/io/formats/templates/typst.tpl new file mode 100644 index 0000000000000..66de8f31b405e --- /dev/null +++ b/pandas/io/formats/templates/typst.tpl @@ -0,0 +1,12 @@ +#table( + columns: {{ head[0] | length }}, +{% for r in head %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} + +{% for r in body %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} +) diff --git a/pandas/io/html.py b/pandas/io/html.py index c9897f628fdc9..183af3a03221b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -454,15 +454,26 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, section="header") - body = self._expand_colspan_rowspan(body_rows, section="body") - footer = self._expand_colspan_rowspan(footer_rows, section="footer") + header, rem = self._expand_colspan_rowspan(header_rows, section="header") + body, rem = self._expand_colspan_rowspan( + body_rows, + section="body", + remainder=rem, + overflow=len(footer_rows) > 0, + ) + footer, _ = self._expand_colspan_rowspan( + footer_rows, section="footer", remainder=rem, overflow=False + ) return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "footer", "body"] - ) -> list[list]: + self, + rows, + section: Literal["header", "footer", "body"], + remainder: list[tuple[int, str | tuple, int]] | None = None, + overflow: bool = True, + ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]: """ Given a list of <tr>s, return a list of text rows. @@ -471,12 +482,20 @@ rows : list of node-like List of <tr>s section : the section that the rows belong to (header, body or footer). + remainder : list[tuple[int, str | tuple, int]] | None + Any remainder from the expansion of the previous section + overflow : bool + If True, return any partial rows as 'remainder'. If not, use up any + partial rows. True by default. Returns ------- list of list Each returned row is a list of str text, or tuple (text, link) if extract_links is not None. + remainder + Remaining partial rows if any. If overflow is False, an empty list + is returned.
Notes ----- @@ -485,9 +504,7 @@ def _expand_colspan_rowspan( """ all_texts = [] # list of rows, each a list of str text: str | tuple - remainder: list[ - tuple[int, str | tuple, int] - ] = [] # list of (index, text, nrows) + remainder = remainder if remainder is not None else [] for tr in rows: texts = [] # the output for this row @@ -528,19 +545,20 @@ def _expand_colspan_rowspan( all_texts.append(texts) remainder = next_remainder - # Append rows that only appear because the previous row had non-1 - # rowspan - while remainder: - next_remainder = [] - texts = [] - for prev_i, prev_text, prev_rowspan in remainder: - texts.append(prev_text) - if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) - all_texts.append(texts) - remainder = next_remainder + if not overflow: + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder - return all_texts + return all_texts, remainder def _handle_hidden_tables(self, tbl_list, attr_name: str): """ diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index 8f4e7a62834b5..39f78e26d6041 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -7,9 +7,9 @@ from pandas.io.json._table_schema import build_table_schema __all__ = [ - "ujson_dumps", - "ujson_loads", + "build_table_schema", "read_json", "to_json", - "build_table_schema", + "ujson_dumps", + "ujson_loads", ] diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e9c9f5ba225a5..237518b3c8d92 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -36,7 +36,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -48,6 +47,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -364,10 +364,8 @@ def __init__( ) # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ( - (obj.ndim == 1) - and (obj.name in set(obj.index.names)) - or len(obj.columns.intersection(obj.index.names)) + if ((obj.ndim == 1) and (obj.name in set(obj.index.names))) or len( + obj.columns.intersection(obj.index.names) ): msg = "Overlapping names between the index and columns" raise ValueError(msg) @@ -940,18 +938,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 9d250ee5c08ce..7879be18b52c9 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -239,9 +239,16 @@ def build_table_schema( """ Create a Table schema from ``data``. 
+ This method is a utility to generate a JSON-serializable schema + representation of a pandas Series or DataFrame, compatible with the + Table Schema specification. It enables structured data to be shared + and validated in various applications, ensuring consistency and + interoperability. + Parameters ---------- - data : Series, DataFrame + data : Series or DataFrame + The input data for which the table schema is to be created. index : bool, default True Whether to include ``data.index`` in the schema. primary_key : bool or None, default True @@ -256,6 +263,12 @@ def build_table_schema( Returns ------- dict + A dictionary representing the Table schema. + + See Also + -------- + DataFrame.to_json : Convert the object to a JSON string. + read_json : Convert a JSON string to pandas object. Notes ----- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index f179dafc919e5..a945f3dc38d35 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -127,21 +124,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 24415299e799b..6a5a83088e986 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -15,22 +15,19 @@ filterwarnings, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -245,21 +242,11 @@ def read( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - path_or_handle, handles, filesystem = 
_get_path_or_handle( path, filesystem, @@ -280,7 +267,11 @@ def read( "make_block is deprecated", DeprecationWarning, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -361,6 +352,7 @@ def read( filters=None, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} @@ -376,6 +368,10 @@ def read( raise NotImplementedError( "filesystem is not implemented for the fastparquet engine." ) + if to_pandas_kwargs is not None: + raise NotImplementedError( + "to_pandas_kwargs is not implemented for the fastparquet engine." + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -466,7 +462,7 @@ def to_parquet( .. versionadded:: 2.1.0 kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. Returns ------- @@ -505,6 +501,7 @@ def read_parquet( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, filters: list[tuple] | list[list[tuple]] | None = None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: """ @@ -578,6 +575,12 @@ def read_parquet( .. versionadded:: 2.1.0 + to_pandas_kwargs : dict | None, default None + Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas` + when ``engine="pyarrow"``. + + .. versionadded:: 3.0.0 + **kwargs Any additional kwargs are passed to the engine. @@ -650,5 +653,6 @@ def read_parquet( storage_options=storage_options, dtype_backend=dtype_backend, filesystem=filesystem, + to_pandas_kwargs=to_pandas_kwargs, **kwargs, ) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 86bb5f190e403..8cadde1ad6537 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -171,7 +165,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # The only way self.names is not the same length as number of cols is # if we have int index_col. We should just pad the names(they will get # removed anyways) to expected length then. 
- self.names = list(range(num_cols - len(self.names))) + self.names + columns_prefix = [str(x) for x in range(num_cols - len(self.names))] + self.names = columns_prefix + self.names multi_index_named = False frame.columns = self.names @@ -293,17 +288,8 @@ def read(self) -> DataFrame: "make_block is deprecated", DeprecationWarning, ) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 7294efe843cce..e263c69376d05 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -368,7 +368,7 @@ def _agg_index(self, index) -> Index: index_converter = converters.get(self.index_names[i]) is not None try_num_bool = not ( - cast_type and is_string_dtype(cast_type) or index_converter + (cast_type and is_string_dtype(cast_type)) or index_converter ) arr, _ = self._infer_types( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 99d584db61755..db9547a18b600 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1052,8 +1052,9 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: for line in lines if ( len(line) > 1 - or len(line) == 1 - and (not isinstance(line[0], str) or line[0].strip()) + or ( + len(line) == 1 and (not isinstance(line[0], str) or line[0].strip()) + ) ) ] return ret diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ffc2690a5efdf..54877017f76fc 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -32,7 +32,10 @@ AbstractMethodError, ParserWarning, ) -from pandas.util._decorators import Appender +from pandas.util._decorators import ( + Appender, + set_module, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -771,6 +774,7 @@ def read_csv( % "filepath_or_buffer", ) ) +@set_module("pandas") def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -906,6 +910,7 @@ def read_table( % "filepath_or_buffer", ) ) +@set_module("pandas") def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -1023,6 +1028,7 @@ def read_fwf( ) -> DataFrame: ... 
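The ``@set_module("pandas")`` decorators added to the CSV readers only change introspection metadata, not behaviour; the intended effect (also asserted by the test_api additions later in this patch) is roughly:

import pandas as pd

# the public readers now report the canonical module instead of
# pandas.io.parsers.readers
assert pd.read_csv.__module__ == "pandas"
assert pd.read_table.__module__ == "pandas"
assert pd.read_fwf.__module__ == "pandas"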
+@set_module("pandas") def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 618254fee9259..e18db2e53113f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,12 +86,16 @@ PeriodArray, ) from pandas.core.arrays.datetimes import tz_to_dtype +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, maybe_expression, ) -from pandas.core.construction import extract_array +from pandas.core.construction import ( + array as pd_array, + extract_array, +) from pandas.core.indexes.api import ensure_index from pandas.io.common import stringify_path @@ -126,8 +130,7 @@ npt, ) - from pandas.core.internals.blocks import Block - + from pandas.core.internals import Block # versioning attribute _version = "0.15.2" @@ -3024,6 +3027,9 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if isinstance(node, tables.VLArray): ret = node[0][start:stop] + dtype = getattr(attrs, "value_type", None) + if dtype is not None: + ret = pd_array(ret, dtype=dtype) else: dtype = getattr(attrs, "value_type", None) shape = getattr(attrs, "shape", None) @@ -3263,6 +3269,11 @@ def write_array( elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif isinstance(value, BaseStringArray): + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value.to_numpy()) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) elif empty_array: self.write_array_empty(key, value) else: @@ -3295,7 +3306,11 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): result = result.astype(StringDtype(na_value=np.nan)) return result @@ -3364,7 +3379,11 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) @@ -4738,9 +4757,13 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array( + values, + skipna=True, + ) ): df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) @@ -5210,7 +5233,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode(encoding, errors=errors) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -5298,6 +5323,8 @@ def 
_dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5aab4d967cd4..792af5ff713a3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -22,6 +22,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -699,6 +701,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -715,6 +718,9 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) + if infer_string: + rslt[name] = rslt[name].astype("str") + js += 1 else: self.close() diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9aff5600cf49b..5652d7fab0c7c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -45,11 +45,10 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, + is_object_dtype, + is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - DatetimeTZDtype, -) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas import get_option @@ -58,12 +57,15 @@ Series, ) from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.common import maybe_make_list from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime +from pandas.io._util import arrow_table_to_pandas + if TYPE_CHECKING: from collections.abc import ( Callable, @@ -239,7 +241,7 @@ def read_sql_table( # pyright: ignore[reportOverlappingOverload] schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -253,7 +255,7 @@ def read_sql_table( schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -266,7 +268,7 @@ def read_sql_table( schema: str | None = None, index_col: str | list[str] | None = None, coerce_float: bool = True, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, columns: list[str] | None = None, chunksize: int | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -370,7 +372,7 @@ def read_sql_query( # pyright: ignore[reportOverlappingOverload] index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: None = 
..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -384,7 +386,7 @@ def read_sql_query( index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -397,7 +399,7 @@ def read_sql_query( index_col: str | list[str] | None = None, coerce_float: bool = True, params: list[Any] | Mapping[str, Any] | None = None, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -1316,7 +1318,12 @@ def _harmonize_columns( elif dtype_backend == "numpy" and col_type is float: # floats support NA, can always convert! self.frame[col_name] = df_col.astype(col_type) - + elif ( + using_string_dtype() + and is_string_dtype(col_type) + and is_object_dtype(self.frame[col_name]) + ): + self.frame[col_name] = df_col.astype(col_type) elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: @@ -1403,6 +1410,7 @@ def _get_dtype(self, sqltype): DateTime, Float, Integer, + String, ) if isinstance(sqltype, Float): @@ -1422,6 +1430,10 @@ def _get_dtype(self, sqltype): return date elif isinstance(sqltype, Boolean): return bool + elif isinstance(sqltype, String): + if using_string_dtype(): + return StringDtype(na_value=np.nan) + return object @@ -2195,23 +2207,10 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2279,19 +2278,10 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - with self.con.cursor() as cur: cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ed89d5766c306..34d95fb59a21c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2206,15 +2206,15 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict: new_dict = {} - for key in convert_dates: + for key, value in convert_dates.items(): if not 
convert_dates[key].startswith("%"): # make sure proper fmts - convert_dates[key] = "%" + convert_dates[key] + convert_dates[key] = "%" + value if key in varlist: - new_dict.update({varlist.index(key): convert_dates[key]}) + new_dict[varlist.index(key)] = convert_dates[key] else: if not isinstance(key, int): raise ValueError("convert_dates key must be a column or an integer") - new_dict.update({key: convert_dates[key]}) + new_dict[key] = convert_dates[key] return new_dict @@ -2748,6 +2748,18 @@ def write_file(self) -> None: """ Export DataFrame object to Stata dta format. + This method writes the contents of a pandas DataFrame to a `.dta` file + compatible with Stata. It includes features for handling value labels, + variable types, and metadata like timestamps and data labels. The output + file can then be read and used in Stata or other compatible statistical + tools. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + io.stata.StataWriter : A class for writing Stata binary dta files. + Examples -------- >>> df = pd.DataFrame( @@ -2867,7 +2879,7 @@ def _write_header( # ds_format - just use 114 self._write_bytes(struct.pack("b", 114)) # byteorder - self._write(byteorder == ">" and "\x01" or "\x02") + self._write((byteorder == ">" and "\x01") or "\x02") # filetype self._write("\x01") # unused @@ -3413,7 +3425,7 @@ def _write_header( # ds_format - 117 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder - bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) + bio.write(self._tag((byteorder == ">" and "MSF") or "LSF", "byteorder")) # number of vars, 2 bytes in 117 and 118, 4 byte in 119 nvar_type = "H" if self._dta_version <= 118 else "I" bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) diff --git a/pandas/meson.build b/pandas/meson.build index 435103a954d86..840ac257bba09 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -1,7 +1,8 @@ -incdir_numpy = run_command(py, - [ - '-c', - ''' +incdir_numpy = run_command( + py, + [ + '-c', + ''' import os import numpy as np try: @@ -12,9 +13,9 @@ try: except Exception: incdir = np.get_include() print(incdir) - ''' - ], - check: true + ''', + ], + check: true, ).stdout().strip() inc_np = include_directories(incdir_numpy) @@ -36,9 +37,9 @@ subdirs_list = [ 'plotting', 'tests', 'tseries', - 'util' + 'util', ] -foreach subdir: subdirs_list +foreach subdir : subdirs_list install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach @@ -47,6 +48,6 @@ top_level_py_list = [ '_typing.py', '_version.py', 'conftest.py', - 'testing.py' + 'testing.py', ] py.install_sources(top_level_py_list, subdir: 'pandas') diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index c7a4c1eacfcae..837bfaf82ca27 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -80,20 +80,20 @@ __all__ = [ "PlotAccessor", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", "boxplot", "boxplot_frame", "boxplot_frame_groupby", + "deregister_matplotlib_converters", "hist_frame", "hist_series", - "scatter_matrix", - "radviz", - "andrews_curves", - "bootstrap_plot", - "parallel_coordinates", "lag_plot", - "autocorrelation_plot", - "table", + "parallel_coordinates", "plot_params", + "radviz", "register_matplotlib_converters", - "deregister_matplotlib_converters", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py 
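For context on ``_maybe_convert_to_int_keys`` and the expanded ``write_file`` docstring above, a minimal ``DataFrame.to_stata`` round trip; the file name is illustrative, and ``"tc"`` (milliseconds since 1960-01-01) gets its leading ``%`` prepended by the helper when omitted:

import pandas as pd

df = pd.DataFrame(
    {"when": pd.to_datetime(["2020-01-01", "2020-06-15"]), "x": [1, 2]}
)
# convert_dates maps a column name to a Stata date format
df.to_stata("example.dta", convert_dates={"when": "tc"})
back = pd.read_stata("example.dta")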
index b60392368d944..aee872f9ae50a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -428,7 +428,7 @@ def hist_frame( >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', ... return_type='axes') >>> type(boxplot) - + If ``return_type`` is `None`, a NumPy array of axes with the same shape as ``layout`` is returned: @@ -570,18 +570,23 @@ def boxplot_frame_groupby( Parameters ---------- - grouped : Grouped DataFrame + grouped : DataFrameGroupBy + The grouped DataFrame object over which to create the box plots. subplots : bool * ``False`` - no subplots will be used * ``True`` - create a subplot for each group. - column : column name or list of names, or vector Can be any valid input to groupby. fontsize : float or str - rot : label rotation angle - grid : Setting this to True will show the grid + Font size for the labels. + rot : float + Rotation angle of labels (in degrees) on the x-axis. + grid : bool + Whether to show grid lines on the plot. ax : Matplotlib axis object, default None - figsize : A tuple (width, height) in inches + The axes on which to draw the plots. If None, uses the current axes. + figsize : tuple of (float, float) + The figure size in inches (width, height). layout : tuple (optional) The layout of the plot: (rows, columns). sharex : bool, default False @@ -599,8 +604,15 @@ def boxplot_frame_groupby( Returns ------- - dict of key/value = group key/DataFrame.boxplot return value - or DataFrame.boxplot return value in case subplots=figures=False + dict or DataFrame.boxplot return value + If ``subplots=True``, returns a dictionary of group keys to the boxplot + return values. If ``subplots=False``, returns the boxplot return value + of a single DataFrame. + + See Also + -------- + DataFrame.boxplot : Create a box plot from a DataFrame. + Series.plot : Plot a Series. 
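A minimal sketch of how the grouped boxplot documented above is usually reached, through ``DataFrameGroupBy.boxplot`` (matplotlib required; column and group names are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((10, 2)), columns=["Col1", "Col2"])
df["X"] = ["A"] * 5 + ["B"] * 5

# subplots=True (the default) returns a mapping of group key -> boxplot result
axes_by_group = df.groupby("X").boxplot(subplots=True)

# subplots=False draws all groups on a single set of axes
ax = df.groupby("X").boxplot(subplots=False)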
Examples -------- @@ -1038,7 +1050,9 @@ def __call__(self, *args, **kwargs): label_name = label_kw or y data.name = label_name else: - match = is_list_like(label_kw) and len(label_kw) == len(y) + # error: Argument 1 to "len" has incompatible type "Any | bool"; + # expected "Sized" [arg-type] + match = is_list_like(label_kw) and len(label_kw) == len(y) # type: ignore[arg-type] if label_kw and not match: raise ValueError( "label should be list-like and same length as y" diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 87f3ca09ad346..ff28868aa0033 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -74,20 +74,20 @@ def plot(data, kind, **kwargs): __all__ = [ - "plot", - "hist_series", - "hist_frame", - "boxplot", - "boxplot_frame", - "boxplot_frame_groupby", - "table", "andrews_curves", "autocorrelation_plot", "bootstrap_plot", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "deregister", + "hist_frame", + "hist_series", "lag_plot", "parallel_coordinates", + "plot", "radviz", - "scatter_matrix", "register", - "deregister", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 6bb10068bee38..5ad30a68ae3c9 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -20,6 +20,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +55,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) @@ -198,10 +200,7 @@ def _make_plot(self, fig: Figure) -> None: else self.data ) - # error: Argument "data" to "_iter_data" of "MPLPlot" has - # incompatible type "object"; expected "DataFrame | - # dict[Hashable, Series | DataFrame]" - for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 505db4b807cfc..1035150302d2c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1960,7 +1960,7 @@ def _make_plot(self, fig: Figure) -> None: ) ax.set_title(label) elif self.stacked: - mask = y > 0 + mask = y >= 0 start = np.where(mask, pos_prior, neg_prior) + self._start_base w = self.bar_width / 2 rect = self._plot( diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 97e510982ab93..1a423ad49c294 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -137,10 +137,7 @@ def _make_plot(self, fig: Figure) -> None: if self.by is not None else self.data ) - - # error: Argument "data" to "_iter_data" of "MPLPlot" has incompatible - # type "object"; expected "DataFrame | dict[Hashable, Series | DataFrame]" - for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = 
self._get_ax(i) kwds = self.kwds.copy() diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 5ebbdf886eb32..6c315c4dce184 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -60,7 +60,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7face74dcbc89..3f839cefe798e 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -178,14 +178,21 @@ def scatter_matrix( """ Draw a matrix of scatter plots. + Each pair of numeric columns in the DataFrame is plotted against each other, + resulting in a matrix of scatter plots. The diagonal plots can display either + histograms or Kernel Density Estimation (KDE) plots for each variable. + Parameters ---------- frame : DataFrame + The data to be plotted. alpha : float, optional Amount of transparency applied. figsize : (float,float), optional A tuple (width, height) in inches. ax : Matplotlib axis object, optional + An existing Matplotlib axis object for the plots. If None, a new axis is + created. grid : bool, optional Setting this to True will show the grid. diagonal : {'hist', 'kde'} @@ -208,6 +215,14 @@ def scatter_matrix( numpy.ndarray A matrix of scatter plots. + See Also + -------- + plotting.parallel_coordinates : Plots parallel coordinates for multivariate data. + plotting.andrews_curves : Generates Andrews curves for visualizing clusters of + multivariate data. + plotting.radviz : Creates a RadViz visualization. + plotting.bootstrap_plot : Visualizes uncertainty in data via bootstrap sampling. + Examples -------- @@ -374,6 +389,12 @@ def andrews_curves( Returns ------- :class:`matplotlib.axes.Axes` + The matplotlib Axes object with the plot. + + See Also + -------- + plotting.parallel_coordinates : Plot parallel coordinates chart. + DataFrame.plot : Make plots of Series or DataFrame. 
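A short sketch exercising the two plotting helpers whose docstrings are expanded above (matplotlib required; the data is synthetic):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((50, 3)), columns=["a", "b", "c"])
df["label"] = np.where(df["a"] > 0, "pos", "neg")

# matrix of pairwise scatter plots, KDE curves on the diagonal
axes = pd.plotting.scatter_matrix(df[["a", "b", "c"]], alpha=0.6, diagonal="kde")

# Andrews curves of the same data, coloured by class
ax = pd.plotting.andrews_curves(df, class_column="label")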
Examples -------- diff --git a/pandas/testing.py b/pandas/testing.py index 0445fa5b5efc0..433b22bf1107e 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -12,6 +12,6 @@ __all__ = [ "assert_extension_array_equal", "assert_frame_equal", - "assert_series_equal", "assert_index_equal", + "assert_series_equal", ] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index b23876d9280f7..4a05259a98087 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -261,6 +261,7 @@ class TestApi(Base): "JsonReader", "NaTType", "NAType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", @@ -400,3 +401,36 @@ def test_util_in_top_level(self): def test_set_module(): assert pd.DataFrame.__module__ == "pandas" + assert pd.CategoricalDtype.__module__ == "pandas" + assert pd.PeriodDtype.__module__ == "pandas" + assert pd.IntervalDtype.__module__ == "pandas" + assert pd.SparseDtype.__module__ == "pandas" + assert pd.ArrowDtype.__module__ == "pandas" + assert pd.StringDtype.__module__ == "pandas" + assert pd.Index.__module__ == "pandas" + assert pd.CategoricalIndex.__module__ == "pandas" + assert pd.DatetimeIndex.__module__ == "pandas" + assert pd.IntervalIndex.__module__ == "pandas" + assert pd.MultiIndex.__module__ == "pandas" + assert pd.PeriodIndex.__module__ == "pandas" + assert pd.RangeIndex.__module__ == "pandas" + assert pd.TimedeltaIndex.__module__ == "pandas" + assert pd.Period.__module__ == "pandas" + assert pd.Timestamp.__module__ == "pandas" + assert pd.Timedelta.__module__ == "pandas" + assert pd.concat.__module__ == "pandas" + assert pd.isna.__module__ == "pandas" + assert pd.notna.__module__ == "pandas" + assert pd.merge.__module__ == "pandas" + assert pd.merge_ordered.__module__ == "pandas" + assert pd.merge_asof.__module__ == "pandas" + assert pd.read_csv.__module__ == "pandas" + assert pd.read_table.__module__ == "pandas" + assert pd.read_fwf.__module__ == "pandas" + assert pd.Series.__module__ == "pandas" + assert pd.date_range.__module__ == "pandas" + assert pd.bdate_range.__module__ == "pandas" + assert pd.timedelta_range.__module__ == "pandas" + assert pd.NamedAgg.__module__ == "pandas" + assert api.typing.SeriesGroupBy.__module__ == "pandas.api.typing" + assert api.typing.DataFrameGroupBy.__module__ == "pandas.api.typing" diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index e19c21f81b3e1..0503bf9166ec7 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,18 +218,12 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - if df.dtypes.iloc[0].storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) + expected = (expected, NotImplementedError) msg = ( - "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == 
"median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - if series.dtype.storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" + if using_infer_string and func == "cumprod": + expected = (expected, NotImplementedError) + + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 732652f24e2eb..ce71cfec535e4 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,9 +4,10 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import WASM +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.core.dtypes.common import is_number @@ -81,7 +82,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -140,7 +140,6 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -163,10 +162,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: + request.applymarker( + pytest.mark.xfail( + raises=NotImplementedError, + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py index 37f38a11cbeae..753d562c87ffa 100644 --- a/pandas/tests/arrays/masked/test_indexing.py +++ b/pandas/tests/arrays/masked/test_indexing.py @@ -8,7 +8,7 @@ class TestSetitemValidation: def _check_setitem_invalid(self, arr, invalid): - msg = f"Invalid value '{invalid!s}' for dtype {arr.dtype}" + msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'" msg = re.escape(msg) with pytest.raises(TypeError, match=msg): arr[0] = invalid diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c35e8204f3437..1b685100e4931 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -480,3 +481,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def 
test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + # for sparse arrays, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a18161f47039b..336a0fef69170 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -10,7 +10,10 @@ from pandas._config import using_string_dtype -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import ( + pa_version_under12p0, + pa_version_under19p0, +) from pandas.core.dtypes.common import is_dtype_equal @@ -109,14 +112,11 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if dtype.storage == "python": - msg = "Cannot set non-string value '10' into a StringArray." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '10' for dtype 'str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - msg = "Must provide strings" + msg = "Invalid value for dtype 'str" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) @@ -508,10 +508,7 @@ def test_fillna_args(dtype): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype str" - else: - msg = "Cannot set non-string value '1' into a StringArray." 
+ msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): arr.fillna(value=1) @@ -545,7 +542,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - if dtype.na_value is np.nan and not using_string_dtype(): + if dtype.na_value is np.nan and not using_infer_string: assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) @@ -559,6 +556,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_from_string(using_infer_string): + # not roundtrip, but starting with pyarrow table without pandas metadata + pa = pytest.importorskip("pyarrow") + table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())}) + + result = table.to_pandas() + + if using_infer_string and not pa_version_under19p0: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str") + else: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object") + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -727,10 +739,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if dtype.storage == "python": - msg = "Cannot set non-string value" - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): ser[mask] = 1 @@ -749,3 +758,9 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + + +def test_string_array_view_type_error(): + arr = pd.array(["a", "b", "c"], dtype="string") + with pytest.raises(TypeError, match="Cannot change data-type for string array."): + arr.view("i8") diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d4363171788d4..e6103da5021bb 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -99,6 +99,20 @@ def test_constructor_valid_string_type_value_dictionary(string_type, chunked): assert pa.types.is_large_string(arr._pa_array.type) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_view(chunked): + # requires pyarrow>=18 for casting string_view to string + pa = pytest.importorskip("pyarrow", minversion="18") + + arr = pa.array(["1", "2", "3"], pa.string_view()) + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) + + def test_constructor_from_list(): # GH#27673 pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 0c8eefab95464..d1ef29b0bf8a0 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, 
dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index c4b02423f8cf0..dffd2009ef373 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -179,3 +179,14 @@ def test_constructor_datetime_nonns(self, constructor): arr.flags.writeable = False result = constructor(arr) tm.assert_equal(result, expected) + + def test_constructor_from_dict_keys(self, constructor, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/60343 + d = {"a": 1, "b": 2} + result = constructor(d.keys(), dtype="str") + if using_infer_string: + assert result.dtype == "str" + else: + assert result.dtype == "object" + expected = constructor(list(d.keys()), dtype="str") + tm.assert_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index d8af7abe83084..e3a821519c638 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,9 +1,8 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -297,24 +296,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -329,6 +331,7 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( @@ -337,6 +340,7 @@ def test_array_multiindex_raises(): dtype=np.dtype("m8[ns]"), ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -347,10 +351,11 @@ def test_array_multiindex_raises(): Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with tm.assert_produces_warning(None): @@ -362,10 +367,29 @@ def test_to_numpy(arr, expected, 
index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for array/asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + if not zero_copy: + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + # An error is always acceptable for `copy=False` + np.array(thing, copy=False) + + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] @@ -377,13 +401,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): # no copy by default result = obj.to_numpy() - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index bb238d08bd9bd..2b3ef9201d918 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -15,8 +17,12 @@ @pytest.mark.parametrize( "method", - [lambda ser: ser.values, lambda ser: np.asarray(ser)], - ids=["values", "asarray"], + [ + lambda ser: ser.values, + lambda ser: np.asarray(ser), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_series_values(method): ser = Series([1, 2, 3], name="name") @@ -40,8 +46,12 @@ def test_series_values(method): @pytest.mark.parametrize( "method", - [lambda df: df.values, lambda df: np.asarray(df)], - ids=["values", "asarray"], + [ + lambda df: df.values, + lambda df: np.asarray(df), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_dataframe_values(method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -82,7 +92,7 @@ def test_series_to_numpy(): ser.iloc[0] = 0 assert ser.values[0] == 0 - # specify copy=False gives a writeable array + # specify copy=True gives a writeable array ser = Series([1, 2, 3], name="name") arr = ser.to_numpy(copy=True) assert not np.shares_memory(arr, get_array(ser, "name")) @@ -130,6 +140,23 @@ def test_dataframe_multiple_numpy_dtypes(): assert not np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is True + if np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + arr = np.array(df, copy=False) + + arr = np.array(df, copy=True) + assert arr.flags.writeable is True + + +def test_dataframe_single_block_copy_true(): + # the copy=False/None cases are tested above in test_dataframe_values + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = np.array(df, copy=True) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + def test_values_is_ea(): df = DataFrame({"a": date_range("2012-01-01", periods=3)}) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 80c30f2d0c26e..91f5badeb9728 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 @@ -206,7 +204,6 @@ def test_astype_arrow_timestamp(): assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -217,20 +214,25 @@ def test_convert_dtypes_infer_objects(): convert_string=False, ) - assert np.shares_memory(get_array(ser), get_array(result)) + assert tm.shares_memory(get_array(ser), get_array(result)) result.iloc[0] = "x" tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_convert_dtypes(): +def test_convert_dtypes(using_infer_string): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() df2 = df.convert_dtypes() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + if using_infer_string and HAS_PYARROW: + # TODO the default nullable string dtype still uses python storage + # this should be changed to pyarrow if installed + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) df2.iloc[0, 0] = "x" + df2.iloc[0, 1] = 10 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index fcdece6077829..32fea794975b6 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -153,7 +153,6 @@ def test_concat_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index fc57178b897b9..6bcda0ef2c35a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat 
import HAS_PYARROW - from pandas import ( NA, DataFrame, @@ -114,18 +110,18 @@ def test_interp_fill_functions_inplace(func, dtype): assert view._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_cannot_with_object_dtype(): df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df["a"] = df["a"].astype(object) msg = "DataFrame cannot interpolate with object dtype" with pytest.raises(TypeError, match=msg): df.interpolate() -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + df["a"] = df["a"].astype(object) arr_a = get_array(df, "a") # Now CoW makes a copy, it should not! diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 92e1ba750fae2..250697c91ff13 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW import pandas as pd @@ -716,14 +714,18 @@ def test_head_tail(method): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_infer_objects(): - df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) +def test_infer_objects(using_infer_string): + df = DataFrame( + {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"} + ) df_orig = df.copy() df2 = df.infer_objects() assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if using_infer_string and HAS_PYARROW: + assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + else: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) df2.iloc[0, 0] = 0 df2.iloc[0, 1] = "d" @@ -732,19 +734,16 @@ def test_infer_objects(): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) -def test_infer_objects_no_reference(): +def test_infer_objects_no_reference(using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" ), - "e": "b", + "e": Series(["z", "w"], dtype=object), } ) df = df.infer_objects() @@ -757,8 +756,14 @@ def test_infer_objects_no_reference(): df.iloc[0, 1] = "d" df.iloc[0, 3] = Timestamp("2018-12-31") assert np.shares_memory(arr_a, get_array(df, "a")) - # TODO(CoW): Block splitting causes references here - assert not np.shares_memory(arr_b, get_array(df, "b")) + if using_infer_string and HAS_PYARROW: + # note that the underlying memory of arr_b has been copied anyway + # because of the assignment, but the EA is updated inplace so still + # appears the share memory + assert tm.shares_memory(arr_b, get_array(df, "b")) + else: + # TODO(CoW): Block splitting causes references here + assert not np.shares_memory(arr_b, get_array(df, "b")) assert np.shares_memory(arr_d, get_array(df, "d")) @@ -766,7 +771,7 @@ def test_infer_objects_reference(): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" @@ -904,14 +909,13 @@ def test_sort_values_inplace(obj, kwargs): tm.assert_equal(view, obj_orig) 
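The copy-on-write behaviour exercised by the ``infer_objects`` tests above boils down to the following sketch (the ``str`` inference only applies when ``future.infer_string`` is enabled):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": pd.Series(["x", "y"], dtype=object)})

out = df.infer_objects()  # "b" may be inferred as the new str dtype
out.iloc[0, 1] = "z"      # copy-on-write: the original frame is untouched
assert df.loc[0, "b"] == "x"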
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(decimals): df = DataFrame({"a": [1, 2], "b": "c"}) df_orig = df.copy() df2 = df.round(decimals=decimals) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) # TODO: Make inplace by using out parameter of ndarray.round? if decimals >= 0: # Ensure lazy copy if no-op diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index a8acd446ff5f5..d4838a5e68ab8 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( Categorical, DataFrame, @@ -13,7 +9,6 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -30,14 +25,14 @@ ], ) def test_replace(replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df_replaced = df.replace(**replace_kwargs) if (df_replaced["b"] == df["b"]).all(): assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) # mutating squeezed df triggers a copy-on-write for that column/block df_replaced.loc[0, "c"] = -1 @@ -61,18 +56,17 @@ def test_replace_regex_inplace_refs(): tm.assert_frame_equal(view, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) def test_replace_regex_inplace_no_op(): @@ -259,10 +253,9 @@ def test_replace_empty_list(): assert not df2._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(value): - df = DataFrame({"a": ["a", "b", "c"]}) + df = DataFrame({"a": ["a", "b", "c"]}, dtype=object) arr = get_array(df, "a") df.replace(["c"], value, inplace=True) assert np.shares_memory(arr, get_array(df, "a")) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e338fb1331734..fa48393dd183e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -835,3 +835,30 @@ def test_pandas_dtype_string_dtypes(string_storage): with pd.option_context("string_storage", string_storage): result = pandas_dtype("string") assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +def test_pandas_dtype_string_dtype_alias_with_storage(): + with pytest.raises(TypeError, match="not understood"): 
+ pandas_dtype("str[python]") + + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[pyarrow]") + + result = pandas_dtype("string[python]") + assert result == pd.StringDtype("python", na_value=pd.NA) + + if HAS_PYARROW: + result = pandas_dtype("string[pyarrow]") + assert result == pd.StringDtype("pyarrow", na_value=pd.NA) + else: + with pytest.raises( + ImportError, match="required for PyArrow backed StringArray" + ): + pandas_dtype("string[pyarrow]") + + +@td.skip_if_installed("pyarrow") +def test_construct_from_string_without_pyarrow_installed(): + # GH 57928 + with pytest.raises(ImportError, match="pyarrow>=10.0.1 is required"): + pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index da444b55490f0..db98751324ebc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1582,6 +1582,31 @@ def test_is_string_array(self): ) assert not lib.is_string_array(np.array([1, 2])) + @pytest.mark.parametrize( + "func", + [ + "is_bool_array", + "is_date_array", + "is_datetime_array", + "is_datetime64_array", + "is_float_array", + "is_integer_array", + "is_interval_array", + "is_string_array", + "is_time_array", + "is_timedelta_or_timedelta64_array", + ], + ) + def test_is_dtype_array_empty_obj(self, func): + # https://github.com/pandas-dev/pandas/pull/60796 + func = getattr(lib, func) + + arr = np.empty((2, 0), dtype=object) + assert not func(arr) + + arr = np.empty((0, 2), dtype=object) + assert not func(arr) + def test_to_object_array_tuples(self): r = (5, 6) values = [r] diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. 
Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bab8566a06dc2..60cade97ab528 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -113,13 +113,9 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..79eb64b5a654f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +73,25 @@ def test_array_interface(self, data): expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
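                # NumPy >= 2 treats ``np.array(obj, copy=False)`` as a strict
                # "never copy" request and raises ValueError when a copy cannot
                # be avoided; NumPy 1.x silently copied instead. Hence the early
                # return here on older NumPy, and the "an error is always
                # acceptable" branch for ``copy=False`` further down.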
+ return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 547114ecfddd0..222ff42d45052 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,10 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -134,12 +130,6 @@ class BaseArithmeticOpsTests(BaseOpsUtil): series_array_exc: type[Exception] | None = TypeError divmod_exc: type[Exception] | None = TypeError - # TODO(infer_string) need to remove import of pyarrow - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -149,11 +139,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -163,22 +148,12 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_divmod(self, data): ser = pd.Series(data) self._check_divmod_op(ser, divmod, 1) @@ -194,7 +169,6 @@ def test_divmod_series_array(self, data, data_for_twos): other = pd.Series(other) self._check_divmod_op(other, ops.rdivmod, ser) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_add_series_with_extension_array(self, data): # Check adding an ExtensionArray to a Series of the same dtype matches # the behavior of adding the arrays directly and then wrapping in a diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 34727b43a7b0f..47b1c7c57a47a 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -5,4 +5,4 @@ to_decimal, ) -__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] +__all__ = ["DecimalArray", "DecimalDtype", "make_data", "to_decimal"] diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4fa48023fbc95..a68c8a06e1d18 100644 --- 
a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -148,12 +148,20 @@ def __ne__(self, other): return NotImplemented def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f0ff11e5fa3f7..4fccf02e08bd6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. 
+ return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): @@ -441,7 +446,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) @@ -896,9 +901,7 @@ def _is_temporal_supported(self, opname, pa_dtype): ) ) and pa.types.is_duration(pa_dtype) - or opname in ("__sub__", "__rsub__") - and pa.types.is_temporal(pa_dtype) - ) + ) or (opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype)) def _get_expected_exception( self, op_name: str, obj, other @@ -1649,7 +1652,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) @@ -3453,7 +3456,9 @@ def test_string_to_datetime_parsing_cast(): ) def test_interpolate_not_numeric(data): if not data.dtype._is_numeric: - with pytest.raises(ValueError, match="Values must be numeric."): + ser = pd.Series(data) + msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype") + with pytest.raises(TypeError, match=msg): pd.Series(data).interpolate() diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index c3d4b83f731a3..8f8af607585df 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -140,7 +140,6 @@ def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators @@ -152,7 +151,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ) super().test_arith_frame_with_scalar(data, op_name) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index ec979ac6d22dc..011bf0b2016b2 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -101,6 +101,31 @@ def test_fillna_limit_series(self, data_missing): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + 
def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 1b251a5118681..79cfb736941d6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -19,8 +19,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd @@ -257,7 +255,6 @@ def test_insert_invalid(self, data, invalid_scalar): frame_scalar_exc = None series_array_exc = None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod(self, data): divmod_exc = None if data.dtype.kind == "O": @@ -265,7 +262,6 @@ def test_divmod(self, data): self.divmod_exc = divmod_exc super().test_divmod(data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod_series_array(self, data): ser = pd.Series(data) exc = None @@ -274,7 +270,6 @@ def test_divmod_series_array(self, data): self.divmod_exc = exc self._check_divmod_op(ser, divmod, data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators series_scalar_exc = None @@ -288,7 +283,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators series_array_exc = None @@ -297,7 +291,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators frame_scalar_exc = None diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 27621193a9b8d..6ce48e434d329 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -187,12 +189,19 @@ def _get_expected_exception( return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return ( - op_name in ["min", "max", "sum"] - or ser.dtype.na_value is np.nan # type: ignore[union-attr] + return op_name in ["min", "max", "sum"] or ( + ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git 
a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 8da7ac635f293..b3140bad8276b 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -18,7 +18,7 @@ def datetime_frame() -> DataFrame: """ return DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -33,7 +33,7 @@ def float_string_frame(): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), ) df["foo"] = "bar" return df diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index fc7c03dc25839..1509c47ba65c7 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -108,7 +108,6 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index cb1cbd68ede63..1a454351b7085 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -84,14 +82,18 @@ def test_6942(indexer_al): assert df.iloc[0, 0] == t2 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_26395(indexer_al): # .at case fixed by GH#45121 (best guess) df = DataFrame(index=["A", "B", "C"]) df["D"] = 0 indexer_al(df)["C", "D"] = 2 - expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) + expected = DataFrame( + {"D": [0, 0, 2]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.int64, + ) tm.assert_frame_equal(df, expected) with pytest.raises(TypeError, match="Invalid value"): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 0723c3c70091c..a9bc485283985 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,10 +9,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import iNaT -from pandas.compat import HAS_PYARROW from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_integer @@ -177,7 +174,6 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() @@ -505,18 +501,16 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_setitem_None(self, float_frame, using_infer_string): + def test_setitem_None(self, float_frame): # GH #766 
float_frame[None] = float_frame["A"] - key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, key], float_frame["A"], check_names=False + float_frame.loc[:, None], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 @@ -1126,7 +1120,6 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1145,13 +1138,10 @@ def test_loc_setitem_datetimelike_with_inference(self): result = df.dtypes expected = Series( [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, - index=list("ABCDEFGH"), + index=Index(list("ABCDEFGH"), dtype=object), ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1193,7 +1183,7 @@ def test_getitem_boolean_indexing_mixed(self): tm.assert_frame_equal(df2, expected) df["foo"] = "test" - msg = "not supported between instances|unorderable types" + msg = "not supported between instances|unorderable types|Invalid comparison" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 @@ -1281,7 +1271,7 @@ def test_setting_mismatched_na_into_nullable_fails( r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype", r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype", "'values' contains non-numeric NA", - r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}", + r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 3dd8f7196c594..a1d60eb9626d6 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import PerformanceWarning from pandas import ( @@ -63,7 +61,6 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() @@ -71,7 +68,8 @@ def test_insert_with_columns_dups(self): df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) exp = DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], + columns=Index(["A", "A", "A"], dtype=object), ) tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cb971b31c13c4..cfd7e91c4ceab 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import 
using_string_dtype - from pandas.core.dtypes.base import _registry as ea_registry from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import ( @@ -146,13 +144,16 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] - exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + exp = DataFrame( + data={"X": ["x", "y", "z"]}, + index=["A", "B", "C"], + columns=Index(["X"], dtype=object), + ) tm.assert_frame_equal(df, exp) def test_setitem_dt64_index_empty_columns(self): @@ -162,14 +163,15 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, + index=range(3), + columns=Index(["now"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -202,14 +204,13 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}) + expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object)) tm.assert_frame_equal(result, expected) @@ -672,11 +673,10 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") - df = DataFrame(index=index) + df = DataFrame(index=index, columns=Index([], dtype="str")) df["a"] = Series(name="a", index=index, dtype=np.uint32) df["b"] = Series(name="b", index=index, dtype="S64") df["c"] = Series(name="c", index=index, dtype="S64") @@ -705,7 +705,6 @@ def test_setitem_ea_dtype_rhs_series(self): expected = DataFrame({"a": [1, 2]}, dtype="Int64") tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -714,7 +713,7 @@ def test_setitem_npmatrix_2d(self): ) a = np.ones((10, 1)) - df = DataFrame(index=np.arange(10)) + df = DataFrame(index=np.arange(10), columns=Index([], dtype="str")) df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning @@ -927,12 +926,11 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() df["foo"] = 1 - expected = DataFrame(columns=["foo"]).astype(np.int64) + expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64) tm.assert_frame_equal(df, expected) 
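# A minimal sketch (illustration, not part of the diff) of the inference behavior
# these expected frames encode, assuming a pandas build where the
# ``future.infer_string`` option is available: with it enabled, string data is
# inferred as the dedicated string dtype rather than object, which is why the
# expectations above spell out ``dtype=object`` / ``dtype="str"`` explicitly
# instead of relying on default inference.
import pandas as pd

with pd.option_context("future.infer_string", True):
    ser = pd.Series(["x", "y", "z"])
    print(ser.dtype)  # a string dtype, not object

    df = pd.DataFrame({"X": ["x", "y", "z"]}, index=["A", "B", "C"])
    print(df.dtypes)  # column "X" is likewise a string dtype under the option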
def test_setitem_newcol_tuple_key(self, float_frame): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 32a827c25c77a..d6570fcda2ee8 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -50,7 +46,6 @@ def is_ok(s): class TestDataFrameIndexingWhere: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) @@ -68,7 +63,10 @@ def _check_get(df, cond, check_dtypes=True): # check getting df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -101,7 +99,6 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -133,7 +130,10 @@ def _check_align(df, cond, other, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -176,7 +176,6 @@ def test_where_invalid(self): with pytest.raises(ValueError, match=msg): df.mask(0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace @@ -198,7 +197,10 @@ def _check_set(df, cond, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -929,7 +931,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): mask = np.array([True, True, False], ndmin=obj.ndim).T - msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}" + msg = r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'" for null in tm.NP_NAT_OBJECTS + [pd.NaT]: # NaT is an NA value that we should *not* cast to pd.NA dtype @@ -940,9 +942,6 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @given(data=OPTIONAL_ONE_OF_ALL) def test_where_inplace_casting(data): # GH 22051 @@ -1023,19 +1022,12 @@ def test_where_producing_ea_cond_for_np_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string, request): +def test_where_int_overflow(replacement): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) 
- if using_infer_string and replacement not in (None, "snake"): - request.node.add_marker( - pytest.mark.xfail(reason="Can't set non-string into string column") - ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index a01b68f1fea2a..54733129b4d47 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -74,10 +72,9 @@ def test_xs_other(self, float_frame): tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) assert not (expected == 5).all() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case - df = DataFrame(index=[0]) + df = DataFrame(index=[0], columns=Index([], dtype="str")) df["A"] = 1.0 df["B"] = "foo" df["C"] = 2.0 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index ab3743283ea13..eb1ee4e7b2970 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -745,7 +745,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 87b7d5052a345..1e594043510ea 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -32,8 +30,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_combine_first(self, float_frame, using_infer_string): + def test_combine_first(self, float_frame): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -79,9 +76,7 @@ def test_combine_first(self, float_frame, using_infer_string): tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - warning = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warning, match="empty entries"): - comb = float_frame.combine_first(DataFrame()) + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) @@ -385,7 +380,7 @@ def test_combine_first_with_asymmetric_other(self, val): df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = DataFrame({"isBool": [True], "isNum": [val]}) + exp = DataFrame({"isNum": [val], "isBool": [True]}) tm.assert_frame_equal(res, exp) @@ -560,3 +555,13 @@ def test_combine_first_empty_columns(): result = left.combine_first(right) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + + +def test_combine_first_preserve_column_order(): + # 
GH#60427 + df1 = DataFrame({"B": [1, 2, 3], "A": [4, None, 6]}) + df2 = DataFrame({"A": [5]}, index=[1]) + + result = df1.combine_first(df2) + expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index c15952339ef18..d5e94382b8314 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -320,7 +318,6 @@ def test_corrwith_non_timeseries_data(self): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -334,9 +331,8 @@ def test_corrwith_with_objects(self, using_infer_string): df2["obj"] = "bar" if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): df1.corrwith(df2) else: with pytest.raises(TypeError, match="Could not convert"): diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 4a60dc09cfe07..d4f5629e6ba4b 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -184,10 +182,12 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dropna_tz_aware_datetime(self): + def test_dropna_tz_aware_datetime(self, using_infer_string): # GH13407 + df = DataFrame() + if using_infer_string: + df.columns = df.columns.astype("str") dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) df["Time"] = [dt1] diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 1685f9ee331f5..bf01ec73cf72b 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -135,13 +133,9 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - if using_infer_string: - expected = Series([np.array(["bar"])]) - else: - expected = Series(["bar"]) + expected = Series(np.array("bar")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index ad1a37916e381..67d1d45af1cb3 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,8 
+1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -65,15 +63,20 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(TypeError, match=msg): datetime_frame.fillna() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def test_fillna_mixed_type(self, float_string_frame): + def test_fillna_mixed_type(self, float_string_frame, using_infer_string): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan mf.loc[mf.index[-10:], "A"] = np.nan - # TODO: make stronger assertion here, GH 25640 - mf.fillna(value=0) - mf.ffill() + + result = mf.ffill() + assert ( + result.loc[result.index[-10:], "A"] == result.loc[result.index[-11], "A"] + ).all() + assert (result.loc[result.index[5:20], "foo"] == "bar").all() + + result = mf.fillna(value=0) + assert (result.loc[result.index[-10:], "A"] == 0).all() + assert (result.loc[result.index[5:20], "foo"] == 0).all() def test_fillna_mixed_float(self, mixed_float_frame): # mixed numeric (but no float16) @@ -84,28 +87,21 @@ def test_fillna_mixed_float(self, mixed_float_frame): result = mf.ffill() _check_mixed_float(result, dtype={"C": None}) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_fillna_different_dtype(self, using_infer_string): + def test_fillna_different_dtype(self): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + # column is originally float (all-NaN) -> filling with string gives object dtype + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -276,8 +272,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_fillna_dtype_conversion(self, using_infer_string): + def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -292,7 +287,7 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + expected = DataFrame("nan", dtype="object", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) @@ -540,18 +535,10 @@ def test_fillna_col_reordering(self): filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def 
test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - float_frame.reindex(columns=[]).fillna(value=0) + def test_fill_empty(self, float_frame): + df = float_frame.reindex(columns=[]) + result = df.fillna(value=0) + tm.assert_frame_equal(result, df) def test_fillna_with_columns_and_limit(self): # GH40989 diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index aad43b7a77ac7..74e4383950174 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( HAS_PYARROW, IS64, @@ -436,18 +434,25 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_info_memory_usage_qualified(): +def test_info_memory_usage_qualified(using_infer_string): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object)) df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str")) + df.info(buf=buf) + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() + buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) @@ -460,7 +465,10 @@ def test_info_memory_usage_qualified(): 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) - assert "+" in buf.getvalue() + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(): @@ -497,16 +505,15 @@ def test_info_categorical(): df.info(buf=buf) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") -def test_info_int_columns(): +def test_info_int_columns(using_infer_string): # GH#37245 df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) buf = StringIO() df.info(show_counts=True, buf=buf) result = buf.getvalue() expected = textwrap.dedent( - """\ + f"""\ Index: 2 entries, A to B Data columns (total 2 columns): @@ -515,19 +522,22 @@ def test_info_int_columns(): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: 48.0+ bytes + memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes """ ) assert result == expected -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_memory_usage_empty_no_warning(): +def test_memory_usage_empty_no_warning(using_infer_string): # GH#50066 df = DataFrame(index=["a", "b"]) with tm.assert_produces_warning(None): result = df.memory_usage() - expected = Series(16 if IS64 else 8, index=["Index"]) + if using_infer_string and HAS_PYARROW: + value = 18 + else: + value = 16 if IS64 else 8 + expected = Series(value, index=["Index"]) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index b8a34d5eaa226..09d1cc9a479b2 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -64,11 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, request): assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - # TODO(infer_string) raise proper TypeError in case of string dtype - @pytest.mark.xfail( - using_string_dtype(), reason="interpolate doesn't work for string" - ) - def test_interp_basic(self): + def test_interp_basic(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -77,7 +73,8 @@ def test_interp_basic(self): "D": list("abcd"), } ) - msg = "DataFrame cannot interpolate with object dtype" + dtype = "str" if using_infer_string else "object" + msg = f"[Cc]annot interpolate with {dtype} dtype" with pytest.raises(TypeError, match=msg): df.interpolate() @@ -87,8 +84,8 @@ def test_interp_basic(self): df.interpolate(inplace=True) # check we DID operate inplace - assert np.shares_memory(df["C"]._values, cvalues) - assert np.shares_memory(df["D"]._values, dvalues) + assert tm.shares_memory(df["C"]._values, cvalues) + assert tm.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( using_string_dtype(), reason="interpolate doesn't work for string" diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 52e871cc795b4..c6e5304ae3cb4 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -159,7 +159,7 @@ def test_nlargest_n_duplicate_index(self, n, order, request): result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False).head(n) if Version(np.__version__) >= Version("1.25") and ( - (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 + (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): request.applymarker( pytest.mark.xfail( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 6b872bf48d550..9e302dc5f94ee 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, 0) - expected = float_string_frame.fillna(value=0) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=0) tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() @@ -291,22 +290,20 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): - df = 
DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) - expected = DataFrame( - {"Type": Series([0, 1, 0, 0, 1], dtype=df.Type.dtype), "tmp": 2} + df = DataFrame( + {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2} ) + expected = DataFrame({"Type": Series([0, 1, 0, 0, 1], dtype=object), "tmp": 2}) result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( { "a": mix_abc["a"], - "b": np.array([np.nan] * 4, dtype=object), + "b": Series([np.nan] * 4, dtype="str"), "c": [np.nan, np.nan, np.nan, "d"], } ) @@ -326,7 +323,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -338,11 +334,11 @@ def test_regex_replace_str_to_numeric(self, mix_abc): return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + expec["c"] = expec["c"].astype(object) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -535,31 +531,37 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_convert(self): - # gh 3907 - df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + def test_replace_convert(self, any_string_dtype): + # gh 3907 (pandas >= 3.0 no longer converts dtypes) + df = DataFrame( + [["foo", "bar", "bah"], ["bar", "foo", "bah"]], dtype=any_string_dtype + ) m = {"foo": 1, "bar": 2, "bah": 3} rep = df.replace(m) - expec = df.dtypes - res = rep.dtypes - tm.assert_series_equal(expec, res) + assert (rep.dtypes == object).all() - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, -18) - expected = float_string_frame.fillna(value=-18) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-18) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-18, np.nan), expected2) result = float_string_frame.replace(np.nan, -1e8) - expected = float_string_frame.fillna(value=-1e8) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-1e8) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + expected2 = 
float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2) def test_replace_mixed_int_block_upcasting(self): # int block upcasting @@ -601,8 +603,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_replace_mixed2(self, using_infer_string): + def test_replace_mixed2(self): # to object block upcasting df = DataFrame( { @@ -621,7 +622,7 @@ def test_replace_mixed2(self, using_infer_string): expected = DataFrame( { - "A": Series(["foo", "bar"]), + "A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object"), } ) @@ -711,6 +712,13 @@ def test_replace_with_None_keeps_categorical(self): ) tm.assert_frame_equal(result, expected) + def test_replace_all_NA(self): + # GH#60688 + df = DataFrame({"ticker": ["#1234#"], "name": [None]}) + result = df.replace({col: {r"^#": "$"} for col in df.columns}, regex=True) + expected = DataFrame({"ticker": ["$1234#"], "name": [None]}) + tm.assert_frame_equal(result, expected) + def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] @@ -889,7 +897,6 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -918,8 +925,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_dict_no_regex(self): + def test_replace_dict_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -927,7 +933,8 @@ def test_replace_dict_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = { "Agree": 4, @@ -936,12 +943,11 @@ def test_replace_dict_no_regex(self): "Strongly Agree": 5, "Strongly Disagree": 1, } - expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=answer.dtype) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=object) result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_series_no_regex(self): + def test_replace_series_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -949,7 +955,8 @@ def test_replace_series_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = Series( { @@ -1045,16 +1052,15 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_swapping_bug(self, using_infer_string): + def test_replace_swapping_bug(self): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": 
["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) def test_replace_datetimetz(self): @@ -1176,7 +1182,6 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1189,7 +1194,7 @@ def test_replace_commutative(self, df, to_replace, exp): ) def test_replace_replacer_dtype(self, replacer): # GH26632 - df = DataFrame(["a"]) + df = DataFrame(["a"], dtype=object) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer], dtype=object) tm.assert_frame_equal(result, expected) @@ -1269,7 +1274,6 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): assert return_value is None tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1325,7 +1329,7 @@ def test_replace_value_category_type(self): lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) + result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1366,12 +1370,11 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) result = df.replace({"a": {pd.Interval(0, 1): "x"}}) - expected = DataFrame({"a": ["x", "x"]}) + expected = DataFrame({"a": ["x", "x"]}, dtype=object) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): @@ -1471,18 +1474,25 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_frame(self, regex): + @pytest.mark.parametrize("value", [1, "1"]) + def test_replace_regex_dtype_frame(self, regex, value): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) - expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=df1.dtypes.iloc[0]) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + # When value is an integer, coerce result to object. + # When value is a string, infer the correct string dtype. 
+ dtype = object if value == 1 else None + + expected_df1 = DataFrame({"A": [value], "B": [value]}, dtype=dtype) + result_df1 = df1.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) - expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=df2.dtypes.iloc[0]) - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + if regex: + expected_df2 = DataFrame({"A": [value], "B": ["1"]}, dtype=dtype) + else: + expected_df2 = DataFrame({"A": Series([value], dtype=dtype), "B": ["1"]}) + result_df2 = df2.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 88e43b678a7e4..0b320075ed2d2 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -644,7 +644,7 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 4e490e9e344ba..b52240c208493 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -747,3 +747,22 @@ def test_shift_axis_one_empty(self): df = DataFrame() result = df.shift(1, axis=1) tm.assert_frame_equal(result, df) + + def test_shift_with_offsets_freq_empty(self): + # GH#60102 + dates = date_range("2020-01-01", periods=3, freq="D") + offset = offsets.Day() + shifted_dates = dates + offset + df = DataFrame(index=dates) + df_shifted = DataFrame(index=shifted_dates) + result = df.shift(freq=offset) + tm.assert_frame_equal(result, df_shifted) + + def test_series_shift_interval_preserves_closed(self): + # GH#60389 + ser = Series( + [pd.Interval(1, 2, closed="right"), pd.Interval(2, 3, closed="right")] + ) + result = ser.shift(1) + expected = Series([np.nan, pd.Interval(1, 2, closed="right")]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index adb327e90bb76..9eafc69013ffe 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError import pandas as pd @@ -44,7 +42,6 @@ def test_to_csv_from_csv1(self, temp_file, float_frame): float_frame.to_csv(path, header=False) float_frame.to_csv(path, index=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): path = str(temp_file) # test roundtrip @@ -439,20 +436,18 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 rows = chunksize // 2 + 1 df = DataFrame( np.ones((rows, 2)), - columns=Index(list("ab"), dtype=object), + columns=Index(list("ab")), index=MultiIndex.from_arrays([range(rows) for 
_ in range(2)]), ) result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -481,7 +476,7 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols): for _ in range(df_params["c_idx_nlevels"]) ) else: - columns = Index([f"i-{i}" for i in range(ncols)], dtype=object) + columns = Index([f"i-{i}" for i in range(ncols)]) df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) result, expected = self._return_result_expected(df, 1000, **func_params) tm.assert_frame_equal(result, expected, check_names=False) @@ -549,7 +544,6 @@ def test_to_csv_headers(self, temp_file): assert return_value is None tm.assert_frame_equal(to_df, recons) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame): frame = float_frame old_index = frame.index @@ -740,7 +734,6 @@ def test_to_csv_withcommas(self, temp_file): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self, temp_file): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -757,7 +750,7 @@ def create_cols(name): ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( - "foo", index=df_float.index, columns=create_cols("object") + "foo", index=df_float.index, columns=create_cols("object"), dtype="object" ) df_dt = DataFrame( Timestamp("20010101"), @@ -826,13 +819,12 @@ def test_to_csv_dups_cols(self, temp_file): result.columns = df.columns tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self, temp_file): # GH3457 df = DataFrame( np.ones((5, 3)), index=Index([f"i-{i}" for i in range(5)], name="foo"), - columns=Index(["a", "a", "b"], dtype=object), + columns=Index(["a", "a", "b"]), ) path = str(temp_file) diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index d38bc06260a0e..36088cceb13f1 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -31,6 +32,9 @@ def test_to_numpy_copy(self): # and that can be respected because we are already numpy-float assert df.to_numpy(copy=False).base is df.values.base + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index de5029b9f18b2..43db234267f21 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd import pandas._testing as tm @@ -136,9 +132,6 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - 
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 3fb994f2e0aff..2b0bf1b0576f9 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -376,6 +376,5 @@ def test_constructor_expanddim(self): def test_inspect_getmembers(self): # GH38740 - pytest.importorskip("jinja2") df = DataFrame() inspect.getmembers(df) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e41a3b27e592c..aa2d5e9d23815 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW import pandas as pd @@ -1544,9 +1542,6 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( @@ -1554,7 +1549,12 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne) ) f = getattr(operator, compare_operators_no_eq_ne) - msg = "'[<>]=?' not supported between instances of 'str' and 'int'" + msg = "|".join( + [ + "'[<>]=?' not supported between instances of 'str' and 'int'", + "Invalid comparison between dtype=str and int", + ] + ) with pytest.raises(TypeError, match=msg): f(df, 0) @@ -2033,6 +2033,31 @@ def test_arithmetic_multiindex_align(): tm.assert_frame_equal(result, expected) +def test_arithmetic_multiindex_column_align(): + # GH#60498 + df1 = DataFrame( + data=100, + columns=MultiIndex.from_product( + [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"] + ), + index=["C1", "C2"], + ) + df2 = DataFrame( + data=np.array([[0.1, 0.25], [0.2, 0.45]]), + columns=MultiIndex.from_product([["1A", "1B"]], names=["Lev1"]), + index=["C1", "C2"], + ) + expected = DataFrame( + data=np.array([[10.0, 10.0, 25.0, 25.0], [20.0, 20.0, 45.0, 45.0]]), + columns=MultiIndex.from_product( + [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"] + ), + index=["C1", "C2"], + ) + result = df1 * df2 + tm.assert_frame_equal(result, expected) + + def test_bool_frame_mult_float(): # GH 18549 df = DataFrame(True, list("ab"), list("cd")) @@ -2101,12 +2126,19 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_mixed_col_index_dtype(): +def test_mixed_col_index_dtype(using_infer_string): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) df1.columns = df2.columns.astype("string") result = df1 + df2 expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + if using_infer_string: + # df2.columns.dtype will be "str" instead of object, + # so the aligned result will be "string", not object + if HAS_PYARROW: + dtype = "string[pyarrow]" + else: + dtype = "string" + expected.columns = expected.columns.astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index dc163268f64b9..b36b6b5ffe0cc 100644 --- 
a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,8 +2,6 @@ import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -11,9 +9,8 @@ pa = pytest.importorskip("pyarrow") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") -def test_dataframe_arrow_interface(): +def test_dataframe_arrow_interface(using_infer_string): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) capsule = df.__arrow_c_stream__() @@ -25,7 +22,8 @@ def test_dataframe_arrow_interface(): ) table = pa.table(df) - expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + string_type = pa.large_string() if using_infer_string else pa.string() + expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) @@ -34,13 +32,13 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") -def test_dataframe_to_arrow(): +def test_dataframe_to_arrow(using_infer_string): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) table = pa.RecordBatchReader.from_stream(df).read_all() - expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + string_type = pa.large_string() if using_infer_string else pa.string() + expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 25e66a0e1c03d..6fdbfac8f4e0a 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -162,21 +160,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): - # test construction edge cases with mixed types - - # f7u12, this does not work without extensive workaround - data = [ - [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], - ] - df = DataFrame(data) - - # check dtypes - result = df.dtypes - expected = Series({"datetime64[us]": 3}) - # mixed-type frames float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -196,13 +180,11 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert arr = np.array([1, 2, 3], dtype="timedelta64[s]") - df = DataFrame(index=range(3)) - df["A"] = arr + df = DataFrame({"A": arr}) expected = DataFrame( {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) ) @@ -220,11 +202,11 @@ def test_construction_with_conversions(self): assert expected.dtypes["dt1"] == "M8[s]" assert expected.dtypes["dt2"] == 
"M8[s]" - df = DataFrame(index=range(3)) - df["dt1"] = np.datetime64("2013-01-01") - df["dt2"] = np.array( + dt1 = np.datetime64("2013-01-01") + dt2 = np.array( ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" ) + df = DataFrame({"dt1": dt1, "dt2": dt2}) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -401,14 +383,17 @@ def test_update_inplace_sets_valid_block_values(): assert isinstance(df._mgr.blocks[0].values, Categorical) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 # create non-consolidated dataframe with object dtype columns - df = DataFrame() - df["col1"] = Series(["a"], dtype=object) + df = DataFrame( + { + "col1": Series(["a"], dtype=object), + } + ) df["col2"] = Series([0], dtype=object) + assert not df._mgr.is_consolidated() # access column (item cache) df["col1"] == "A" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0a924aa393be5..9b6080603f0c9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2404,6 +2404,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) @@ -2772,6 +2775,14 @@ def test_construction_datetime_resolution_inference(self, cons): res_dtype2 = tm.get_dtype(obj2) assert res_dtype2 == "M8[us, US/Pacific]", res_dtype2 + def test_construction_nan_value_timedelta64_dtype(self): + # GH#60064 + result = DataFrame([None, 1], dtype="timedelta64[ns]") + expected = DataFrame( + ["NaT", "0 days 00:00:00.000000001"], dtype="timedelta64[ns]" + ) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index a574989860957..ca572b1026526 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -159,6 +159,25 @@ def test_query_empty_string(self): with pytest.raises(ValueError, match=msg): df.query("") + def test_query_duplicate_column_name(self, engine, parser): + df = DataFrame( + { + "A": range(3), + "B": range(3), + "C": range(3) + } + ).rename(columns={"B": "A"}) + + res = df.query('C == 1', engine=engine, parser=parser) + + expect = DataFrame( + [[1, 1, 1]], + columns=["A", "A", "C"], + index=[1] + ) + + tm.assert_frame_equal(res, expect) + def test_eval_resolvers_as_list(self): # GH 14095 df = DataFrame( diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 05bb603f5c462..04b1456cdbea6 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -674,23 +672,10 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, 
using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - # TODO(infer_string) avoid this UserWarning for python storage - warning = ( - None - if using_infer_string and df.A.dtype.storage == "pyarrow" - else UserWarning - ) - with tm.assert_produces_warning(warning, match="Unable to sort modes"): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): @@ -1047,7 +1032,6 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1082,7 +1066,6 @@ def test_idxmin_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1099,7 +1082,6 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1133,7 +1115,6 @@ def test_idxmax_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 10cc86385af1b..73628424725e5 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( NA, Categorical, @@ -176,7 +174,6 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index b4f02b6f81b6f..abc14d10514fa 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib import pandas as pd @@ -1675,7 +1673,6 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1858,10 +1855,7 @@ def test_unstack_bug(self, future_stack): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["state", "exp", 
"barcode", "v"]).apply(len) - + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) @@ -1926,7 +1920,6 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -2113,7 +2106,7 @@ def test_unstack_period_frame(self): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - def test_stack_multiple_bug(self, future_stack): + def test_stack_multiple_bug(self, future_stack, using_infer_string): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -2125,6 +2118,8 @@ def test_stack_multiple_bug(self, future_stack): multi.columns.name = "Params" unst = multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 7d18ef28a722d..cbd563a03b908 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -769,6 +769,13 @@ def test_constructor_with_metadata(): assert isinstance(subset, MySubclassWithMetadata) +def test_constructor_with_metadata_from_records(): + # GH#57008 + df = MySubclassWithMetadata.from_records([{"a": 1, "b": 2}]) + assert df.my_metadata is None + assert type(df) is MySubclassWithMetadata + + class SimpleDataFrameSubClass(DataFrame): """A subclass of DataFrame that does not define a constructor.""" diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 217255e73b450..652f52bd226af 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,9 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -122,9 +119,6 @@ def test_pos_object(self, df_data): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") def test_pos_object_raises(self): # GH#21380 diff --git a/pandas/tests/groupby/__init__.py b/pandas/tests/groupby/__init__.py index 446d9da437771..79046cd7ed415 100644 --- a/pandas/tests/groupby/__init__.py +++ b/pandas/tests/groupby/__init__.py @@ -2,7 +2,7 @@ def get_groupby_method_args(name, obj): """ Get required arguments for a groupby method. - When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"), + When parametrizing a test over groupby methods (e.g. "sum", "mean"), it is often the case that arguments are required for certain methods. Parameters @@ -16,7 +16,7 @@ def get_groupby_method_args(name, obj): ------- A tuple of required arguments for the method. 
""" - if name in ("nth", "fillna", "take"): + if name in ("nth", "take"): return (0,) if name == "quantile": return (0.5,) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 46c27849356b5..b7e6e55739c17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -161,6 +159,7 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_frame_equal(grouped.agg("sum"), exp_df) res = grouped.apply(np.sum, axis=0) + exp_df = exp_df.reset_index(drop=True) tm.assert_frame_equal(res, exp_df) @@ -296,12 +295,11 @@ def aggfun_1(ser): assert len(result) == 0 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype == object or ser.dtype == "string": raise TypeError("Test error message") return ser.sum() @@ -1117,7 +1115,6 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( @@ -1129,7 +1126,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d28eb227314c7..a706ea795a0e2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -92,7 +90,6 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -108,7 +105,9 @@ def test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) @@ -148,11 +147,11 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 15c1efe5fd1ff..0cd8a14d97eb0 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -186,6 +186,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) 
+@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"]) +def test_multifunc_numba_vs_cython_frame_noskipna(func): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, np.nan, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(func, skipna=False, engine="numba") + expected = grouped.agg(func, skipna=False, engine="cython") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "agg_kwargs,expected_func", [ diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 835cad0d13078..1c016143d50c3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -308,7 +306,6 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) @@ -501,17 +499,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py new file mode 100644 index 0000000000000..21b7c50c3c5aa --- /dev/null +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +def test_groupby_kurt_equivalence(): + # GH#40139 + # Test that that groupby kurt method (which uses libgroupby.group_kurt) + # matches the results of operating group-by-group (which uses nanops.nankurt) + nrows = 1000 + ngroups = 3 + ncols = 2 + nan_frac = 0.05 + + arr = np.random.default_rng(2).standard_normal((nrows, ncols)) + arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan + + df = pd.DataFrame(arr) + grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) + gb = df.groupby(grps) + + result = gb.kurt() + + grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] + expected = pd.concat(grpwise, axis=0) + expected.index = expected.index.astype("int64") # 32bit builds + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + "Float64", + ], +) +def test_groupby_kurt_arrow_float64(dtype): + # GH#40139 + # 
Test groupby.kurt() with float64[pyarrow] and Float64 dtypes + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + }, + dtype=dtype, + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt() + expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_noskipna(): + # GH#40139 + # Test groupby.kurt() with skipna = False + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]}) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_all_ones(): + # GH#40139 + # Test groupby.kurt() with constant values + df = pd.DataFrame( + { + "x": [1.0] * 10, + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame( + { + "x": [0.0], # Same behavior as pd.DataFrame.kurt() + } + ) + tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 0e31c0698cb1e..28cb25b515ed2 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -158,11 +156,11 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + msg = "dtype '(object|str)' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() @@ -241,7 +239,6 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): @@ -251,9 +248,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) else: - with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" 
- ): + msg = "dtype '.*' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 91200f53e36bd..6664563bd2272 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -76,16 +74,16 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_size_strings(any_string_dtype): +def test_size_strings(any_string_dtype, using_infer_string): # GH#55627 dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype expected = Series( [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), name="b", dtype=exp_dtype, ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8ca6593a19f20..1050f8154572a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -324,12 +324,9 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = DeprecationWarning if groupby == "column" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index 945c3e421a132..4625c5c27a803 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -22,7 +22,7 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) - gb = df.groupby(["a", "b", "c"], group_keys=False) + gb = df.groupby(["a", "b", "c"], group_keys=True) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) if groupby_func == "corrwith": diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 013b308cd14cd..215e627abb018 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -74,6 +74,7 @@ def test_tab_completion(multiindex_dataframe_random_data): "all", "shift", "skew", + "kurt", "take", "pct_change", "any", @@ -173,13 +174,13 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + elif groupby_func in ("sum", "mean", "std", "var"): + 
exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): @@ -231,13 +232,13 @@ def test_series_consistency(request, groupby_func): if groupby_func in ("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + elif groupby_func in ("sum", "mean", "std", "var"): + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a4127ab49b0e..294ab14c96de8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -27,12 +27,9 @@ def test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("index").apply(store) - expected_value = DataFrame( - {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) - ) + df.groupby("index").apply(store) + expected_value = DataFrame({0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)) + expected_value.columns = expected_value.columns.astype(object) tm.assert_frame_equal(groups[0], expected_value) @@ -111,11 +108,7 @@ def test_apply_index_date_object(): ] exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) @@ -189,9 +182,7 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", group_keys=False).apply(func) + df.groupby("a").apply(func) assert names == group_names @@ -209,11 +200,9 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -233,15 +222,27 @@ def 
slow(group): def fast(group): return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - slow_df = df.groupby("A", group_keys=False).apply(slow) - + fast_df = df.groupby("A", group_keys=False).apply(fast) + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) +def test_apply_fast_slow_identical_index(): + # GH#44803 + df = DataFrame( + { + "name": ["Alice", "Bob", "Carl"], + "age": [20, 21, 20], + } + ).set_index("name") + + grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) + tm.assert_frame_equal(grp_by_same_value, grp_by_copy) + + @pytest.mark.parametrize( "func", [ @@ -258,11 +259,8 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): # transparent to the user df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("g", group_keys=False).apply(func) - tm.assert_frame_equal(result, df) + result = df.groupby("g", group_keys=False).apply(func) + tm.assert_frame_equal(result, df[["a", "b"]]) def test_apply_with_mixed_dtype(): @@ -273,19 +271,19 @@ def test_apply_with_mixed_dtype(): "foo2": ["one", "two", "two", "three", "one", "two"], } ) - result = df.apply(lambda x: x, axis=1).dtypes - expected = df.dtypes - tm.assert_series_equal(result, expected) + result = df.apply(lambda x: x, axis=1) + expected = df + tm.assert_frame_equal(result, expected) # GH 3610 incorrect dtype conversion with as_index=False df = DataFrame({"c1": [1, 2, 6, 6, 8]}) df["c2"] = df.c1 / 2.0 - result1 = df.groupby("c2").mean().reset_index().c2 - result2 = df.groupby("c2", as_index=False).mean().c2 - tm.assert_series_equal(result1, result2) + result1 = df.groupby("c2").mean().reset_index() + result2 = df.groupby("c2", as_index=False).mean() + tm.assert_frame_equal(result1, result2) -def test_groupby_as_index_apply(): +def test_groupby_as_index_apply(as_index): # GH #4648 and #3417 df = DataFrame( { @@ -294,38 +292,41 @@ def test_groupby_as_index_apply(): "time": range(6), } ) + gb = df.groupby("user_id", as_index=as_index) - g_as = df.groupby("user_id", as_index=True) - g_not_as = df.groupby("user_id", as_index=False) - - res_as = g_as.head(2).index - res_not_as = g_not_as.head(2).index - exp = Index([0, 1, 2, 4]) - tm.assert_index_equal(res_as, exp) - tm.assert_index_equal(res_not_as, exp) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + expected = DataFrame( + { + "item_id": ["b", "b", "a", "a"], + "user_id": [1, 2, 1, 3], + "time": [0, 1, 2, 4], + }, + index=[0, 1, 2, 4], + ) + result = gb.head(2) + tm.assert_frame_equal(result, expected) # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = Index([0, 2, 1, 4]) - tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, 
names=["user_id", None]) - - tm.assert_index_equal(res_as_apply, exp_as_apply) - tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) + if as_index: + tp = [(1, 0), (1, 2), (2, 1), (3, 4)] + index = MultiIndex.from_tuples(tp, names=["user_id", None]) + else: + index = Index([0, 2, 1, 4]) + expected = DataFrame( + { + "item_id": list("baba"), + "time": [0, 2, 1, 4], + }, + index=index, + ) + result = gb.apply(lambda x: x.head(2)) + tm.assert_frame_equal(result, expected) def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -354,19 +355,13 @@ def desc3(group): # weirdo return result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(desc) + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = grouped.apply(desc2) + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result3 = grouped.apply(desc3) + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -396,9 +391,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["A", "B"]).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -409,9 +402,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -420,9 +411,7 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -445,9 +434,7 @@ def trans2(group): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(trans) + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, 
check_names=False) assert result.name == "C" @@ -476,10 +463,8 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) - expected = df.take([0, 1, 3, 4, 6, 7]) + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + expected = df[["value"]].take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None] @@ -499,9 +484,7 @@ def test_apply_no_name_column_conflict(): # it works! #2605 grouped = df.groupby(["name", "name2"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -518,11 +501,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -544,13 +525,10 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) - tm.assert_frame_equal(result, expected) @@ -584,11 +562,8 @@ def filt2(x): else: return x[x.category == "c"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("id_field").apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -601,19 +576,11 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): if test_series: ser = df.set_index("Y")["X"] result = ser.groupby(level=0, group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_index() - expected = ser.sort_index() + expected = ser tm.assert_series_equal(result, expected) else: - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("Y", group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_values("Y") - expected = df.sort_values("Y") + result = df.groupby("Y", group_keys=False).apply(lambda x: x) + expected = df[["X"]] tm.assert_frame_equal(result, expected) @@ -654,9 +621,7 @@ def f(g): g["value3"] = g["value1"] * 2 return g - msg = "DataFrameGroupBy.apply operated 
on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert "value3" in result @@ -670,13 +635,9 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -689,9 +650,7 @@ def test_apply_numeric_coercion_when_datetime_getitem(): def get_B(g): return g.iloc[0][["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(get_B)["B"] + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -718,11 +677,8 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df2.groupby("Key").apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -737,13 +693,11 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} - ) + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} ) + ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -786,15 +740,11 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) @@ -838,11 +788,8 @@ def test_groupby_apply_all_none(): def test_func(x): pass - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, 
match=msg): - result = test_df.groupby("groups").apply(test_func) - expected = DataFrame(columns=test_df.columns) - expected = expected.astype(test_df.dtypes) + result = test_df.groupby("groups").apply(test_func) + expected = DataFrame(columns=["random_vars"], dtype="int64") tm.assert_frame_equal(result, expected) @@ -852,12 +799,12 @@ def test_func(x): [ {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, [[1, 1], [0, 2]], - {"groups": [1, 1], "vars": [0, 2]}, + {"vars": [0, 2]}, ], [ {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}, [[2, 2], [1, 3]], - {"groups": [2, 2], "vars": [1, 3]}, + {"vars": [1, 3]}, ], ], ) @@ -870,9 +817,7 @@ def test_func(x): return None return x.iloc[[0, -1]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = test_df1.groupby("groups").apply(test_func) + result1 = test_df1.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) @@ -882,9 +827,7 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = groups.apply(lambda group: group[group.value != 1]["value"]) + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -909,9 +852,7 @@ def test_apply_with_mixed_types(meth): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda g: g.index) + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -928,9 +869,7 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -967,9 +906,7 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = tdf.groupby("day").apply(most_common_values)["userId"] + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -1010,13 +947,11 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): ], columns=["observation", "color", "mood", "intensity", "score"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = 
data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( - [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], - index=["observation", "color", "mood", "intensity", "score"], + [np.dtype("datetime64[us]"), dtype, np.int64, dtype], + index=["observation", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1033,10 +968,8 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("group", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["value"]]) @pytest.mark.parametrize( @@ -1058,9 +991,7 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("groups").apply(function) + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1072,9 +1003,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(fct) + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -1085,9 +1014,7 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("id").apply(function) + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1123,9 +1050,7 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1140,11 +1065,8 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + result = df.groupby("A", group_keys=False).apply(lambda x: x) + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1156,15 +1078,8 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df1.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) + result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) + result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) tm.assert_frame_equal(result1, result2) @@ -1187,7 +1102,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1201,7 +1116,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): msg = "" with tm.assert_produces_warning(warn, match=msg): _ = getattr(grp, reduction_func)(*args) - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) @@ -1223,14 +1138,12 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grp.apply(lambda x: x.head(1)) + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns=["idx"]) + expected = expected.drop(columns=["A", "B", "idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: @@ -1247,10 +1160,8 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", 
dropna=dropna, group_keys=False).apply(lambda x: x) - expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + expected = df.dropna()[["col"]] if dropna else df[["col"]].iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1274,9 +1185,7 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1286,9 +1195,7 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1308,12 +1215,10 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) - tm.assert_frame_equal(result, expected) + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) + tm.assert_frame_equal(result, expected[["date", "vals"]]) def test_groupby_apply_shape_cache_safety(): @@ -1354,32 +1259,27 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } - ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) + ) expected = DataFrame( - [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], - columns=["a", "b", "c"], + [[pd.to_datetime(2, unit="s")], [pd.to_datetime(4, unit="s")]], + columns=["c"], index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]), ) tm.assert_frame_equal(result, expected) @@ -1401,11 +1301,9 @@ def 
test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1452,10 +1350,9 @@ def test_apply_index_key_error_bug(index_values): ) def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 - expected = DataFrame({"col": arg}, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + df = DataFrame({"grp": arg, "col": arg}, index=idx) + result = df.groupby("grp", group_keys=False).apply(lambda x: x) + expected = df[["col"]] tm.assert_frame_equal(result, expected) @@ -1484,6 +1381,7 @@ def test_result_name_when_one_group(name): ("apply", lambda gb: gb.values[-1]), ("apply", lambda gb: gb["b"].iloc[0]), ("agg", "skew"), + ("agg", "kurt"), ("agg", "prod"), ("agg", "sum"), ], @@ -1502,19 +1400,12 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("include_groups", [True, False]) -def test_include_groups(include_groups): +def test_include_groups(): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = DeprecationWarning if include_groups else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - result = gb.apply(lambda x: x.sum(), include_groups=include_groups) - expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) - if not include_groups: - expected = expected[["b"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match="include_groups=True is no longer allowed"): + gb.apply(lambda x: x.sum(), include_groups=True) @pytest.mark.parametrize("func, value", [(max, 2), (min, 1), (sum, 3)]) @@ -1523,7 +1414,7 @@ def test_builtins_apply(func, value): # Builtins act as e.g. 
sum(group), which sums the column labels of group df = DataFrame({0: [1, 1, 2], 1: [3, 4, 5], 2: [3, 4, 5]}) gb = df.groupby(0) - result = gb.apply(func, include_groups=False) + result = gb.apply(func) expected = Series([value, value], index=Index([1, 2], name=0)) tm.assert_series_equal(result, expected) @@ -1544,9 +1435,7 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_0)[["B"]] + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -1554,9 +1443,7 @@ def f_1(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_1)[["B"]] + result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -1566,9 +1453,7 @@ def f_2(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_2)[["B"]] + result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -1579,9 +1464,7 @@ def f_3(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_3)[["C"]] + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -1592,10 +1475,42 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_4) + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None tm.assert_series_equal(result, e) + + +def test_nonreducer_nontransform(): + # GH3380, GH60619 + # Was originally testing mutating in a UDF; now kept as an example + # of using apply with a nonreducer and nontransformer.
+ df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "val": np.random.default_rng(2).integers(100, size=14), + } + ) + + def f(x): + x = x.copy() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + expected = DataFrame( + { + "cat1": list("aaaabbb"), + "cat2": list("cdefcde"), + "rank": [3.0, 2.0, 5.0, 1.0, 2.0, 4.0, 1.0], + } + ).set_index(["cat1", "cat2"])["rank"] + result = df.groupby("cat1").apply(f) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py deleted file mode 100644 index fa20efad4da77..0000000000000 --- a/pandas/tests/groupby/test_apply_mutate.py +++ /dev/null @@ -1,96 +0,0 @@ -import numpy as np - -import pandas as pd -import pandas._testing as tm - - -def test_group_by_copy(): - # GH#44803 - df = pd.DataFrame( - { - "name": ["Alice", "Bob", "Carl"], - "age": [20, 21, 20], - } - ).set_index("name") - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_same_value = df.groupby(["age"], group_keys=False).apply( - lambda group: group - ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) - tm.assert_frame_equal(grp_by_same_value, grp_by_copy) - - -def test_mutate_groups(): - # GH3380 - - df = pd.DataFrame( - { - "cat1": ["a"] * 8 + ["b"] * 6, - "cat2": ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2 - + ["f"] * 2 - + ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2, - "cat3": [f"g{x}" for x in range(1, 15)], - "val": np.random.default_rng(2).integers(100, size=14), - } - ) - - def f_copy(x): - x = x.copy() - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - def f_no_copy(x): - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) - tm.assert_series_equal(grpby_copy, grpby_no_copy) - - -def test_no_mutate_but_looks_like(): - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - tm.assert_series_equal(result1, result2) - - -def test_apply_function_with_indexing(): - # GH: 33058 - df = pd.DataFrame( - {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} - ) - - def fn(x): - x.loc[x.index[-1], "col2"] = 0 - return x.col2 - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["col1"], as_index=False).apply(fn) - expected = pd.Series( - [1, 2, 0, 4, 5, 0], - index=range(6), - name="col2", - ) 
- tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1e86b5401ee09..20309e852a556 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -63,6 +61,7 @@ def f(a): "sem": np.nan, "size": 0, "skew": np.nan, + "kurt": np.nan, "std": np.nan, "sum": 0, "var": np.nan, @@ -129,10 +128,8 @@ def test_basic_string(using_infer_string): def f(x): return x.drop_duplicates("person_name").iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() + result = g.apply(f) + expected = x[["person_name"]].iloc[[0, 1]] expected.index = Index([1, 2], name="person_id") dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) @@ -316,14 +313,11 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(lambda x: 1) + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_observed(observed): +def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -331,6 +325,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -1356,11 +1354,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( - lambda rows: DataFrame( - {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} - ) - ) + df.groupby("var").apply(lambda rows: DataFrame({"val": [rows.iloc[-1]["vau"]]})) def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): @@ -1962,10 +1956,7 @@ def test_category_order_transformer( df = df.set_index(keys) args = get_groupby_method_args(transformation_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = FutureWarning if transformation_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, transformation_func)(*args) + op_result = getattr(gb, transformation_func)(*args) result = op_result.index.get_level_values("a").categories expected = Index([1, 4, 3, 2]) tm.assert_index_equal(result, expected) @@ -2036,10 +2027,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, 
orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = DeprecationWarning if method == "apply" and index_kind == "range" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 47ad18c9ad2c8..679f7eb7f7f11 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,9 +289,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + right = df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0d13db79835ba..5bae9b1fd9882 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas.util._test_decorators as td @@ -66,11 +64,9 @@ def test_groupby_nonobject_dtype_mixed(): def max_value(group): return group.loc[group["value"].idxmax()] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - applied = df.groupby("A").apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = df.dtypes + expected = df.drop(columns="A").dtypes tm.assert_series_equal(result, expected) @@ -229,11 +225,8 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("a").apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -425,7 +418,7 @@ def test_frame_multi_key_function_list(): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list_partial_failure(): +def test_frame_multi_key_function_list_partial_failure(using_infer_string): data = DataFrame( { "A": [ @@ -476,6 +469,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -662,9 +657,11 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_raises_on_nuisance(df): +def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not 
support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -699,7 +696,7 @@ def test_keep_nuisance_agg(df, agg_function): ["sum", "mean", "prod", "std", "var", "sem", "median"], ) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # GH 38774, GH 38815 grouped = df.groupby("A") @@ -707,7 +704,10 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - if agg_function in ("std", "sem"): + if using_infer_string: + msg = f"dtype 'str' does not support operation '{agg_function}'" + klass = TypeError + elif agg_function in ("std", "sem"): klass = ValueError msg = "could not convert string to float: 'one'" else: @@ -728,16 +728,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): tm.assert_frame_equal(result, expected) -def test_raise_on_nuisance_python_single(df): +def test_raise_on_nuisance_python_single(df, using_infer_string): # GH 38815 grouped = df.groupby("A") - with pytest.raises(ValueError, match="could not convert"): + + err = ValueError + msg = "could not convert" + if using_infer_string: + err = TypeError + msg = "dtype 'str' does not support operation 'skew'" + with pytest.raises(err, match=msg): grouped.skew() -def test_raise_on_nuisance_python_multiple(three_group): +def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -775,12 +783,16 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): +def test_wrap_aggregated_output_multindex( + multiindex_dataframe_random_data, using_infer_string +): df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -960,8 +972,10 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) + grouped2 = df.groupby(df["A"].rename(None)) + result = grouped.sum() - expected = df.groupby(df["A"].rename(None)).sum() + expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1034,17 +1048,13 @@ def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. 
- return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["C"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize, "metrics") + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1260,7 +1270,6 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -1269,7 +1278,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -1341,10 +1350,8 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("key", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["name"]]) def test_skip_group_keys(): @@ -1421,9 +1428,7 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(f) + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1444,10 +1449,7 @@ def f(group): names.append(group.name) return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", sort=False, group_keys=False).apply(f) - + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1457,8 +1459,8 @@ def test_no_dummy_key_names(df): result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() - assert result.index.names == (None, None) + result2 = df.groupby([df["A"].values, df["B"].values]).sum() + assert result2.index.names == (None, None) def test_groupby_sort_multiindex_series(): @@ -1652,9 +1654,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - g.apply(test_sort) + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -1708,7 +1708,7 @@ def 
test_pivot_table_values_key_error(): ) @pytest.mark.parametrize("method", ["attr", "agg", "apply"]) @pytest.mark.parametrize( - "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] + "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew", "kurt"] ) def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string): # GH8093 & GH26411 @@ -1761,6 +1761,7 @@ def get_categorical_invalid_expected(): is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) + is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype) if ( isinstance(values, Categorical) @@ -1783,26 +1784,28 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if op in ["prod", "sum", "skew"]: + if op in ["prod", "sum", "skew", "kurt"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: msg = "datetime64 type does not support" elif is_per: msg = "Period type does not support" + elif is_str: + msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" - if op == "skew": - msg = "|".join([msg, "does not support operation 'skew'"]) + if op in ["skew", "kurt"]: + msg = "|".join([msg, f"does not support operation '{op}'"]) with pytest.raises(TypeError, match=msg): get_result() if not isinstance(columns, list): # i.e. SeriesGroupBy return - elif op == "skew": + elif op in ["skew", "kurt"]: # TODO: test the numeric_only=True case return else: @@ -1837,10 +1840,8 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x) - assert (res.dtypes == df.dtypes).all() + res = gb.apply(lambda x: x) + assert (res.dtypes == df.drop(columns=1).dtypes).all() def test_tuple_as_grouping(): @@ -2075,36 +2076,14 @@ def test_group_on_empty_multiindex(transformation_func, request): df["col_3"] = df["col_3"].astype(int) df["col_4"] = df["col_4"].astype(int) df = df.set_index(["col_1", "col_2"]) - if transformation_func == "fillna": - args = ("ffill",) - else: - args = () - warn = FutureWarning if transformation_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - with tm.assert_produces_warning(warn, match=warn_msg): - expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df.groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = ( - df["col_3"] - .iloc[:0] - .groupby(["col_1"]) - .transform(transformation_func, *args) - ) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = ( - df["col_3"] - .groupby(["col_1"]) - .transform(transformation_func, *args) - .iloc[:0] - ) + result = 
df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df["col_3"].groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) @@ -2321,7 +2300,6 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2330,7 +2308,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2338,7 +2316,6 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2349,7 +2326,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) @@ -2489,12 +2466,13 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + by = [np.nan] if using_infer_string else [None] + gb = df.groupby(by=by) + result = gb.sum() + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0])) tm.assert_frame_equal(result, expected) @@ -2714,7 +2692,7 @@ def test_obj_with_exclusions_duplicate_columns(): def test_groupby_numeric_only_std_no_result(numeric_only): # GH 51080 dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}] - df = DataFrame(dicts_non_numeric) + df = DataFrame(dicts_non_numeric, dtype=object) dfgb = df.groupby("a", as_index=False, sort=False) if numeric_only: @@ -2773,10 +2751,14 @@ def test_grouping_with_categorical_interval_columns(): def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + if isinstance(bug_var, str): + df = df.astype(object) dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) + expected_df = DataFrame( + [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype + ) tm.assert_frame_equal(result, expected_df) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d42aa06d6bbfe..8c4ab42b7be7a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import 
na_value_for_dtype @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) tm.assert_frame_equal(grouped, expected) @@ -326,9 +323,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 0832b67b38098..3ee9c9ea0c7fd 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,11 +36,11 @@ def test_groupby_preserves_subclass(obj, groupby_func): args = get_groupby_method_args(groupby_func, obj) - warn = FutureWarning if groupby_func == "fillna" else None - msg = f"{type(grouped).__name__}.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + warn = FutureWarning if groupby_func == "corrwith" else None + msg = f"{type(grouped).__name__}.corrwith is deprecated" + with tm.assert_produces_warning(warn, match=msg): result1 = getattr(grouped, groupby_func)(*args) - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + with tm.assert_produces_warning(warn, match=msg): result2 = grouped.agg(groupby_func, *args) # Reduction or transformation kernels should preserve type @@ -72,18 +72,11 @@ def func(group): assert group.testattr == "hello" return group.testattr - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = custom_df.groupby("c").apply(func) + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) - result = custom_df.groupby("c").apply(func, include_groups=False) + result = custom_df.groupby("c").apply(func) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/pull/56761 @@ -109,7 +102,7 @@ def test_groupby_resample_preserves_subclass(obj): df = obj( { - "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), "Quantity": [18, 3, 5, 1, 9, 3], "Date": [ datetime(2013, 9, 1, 13, 0), @@ -124,12 +117,5 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - 
match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = df.groupby("Buyer").resample("5D").sum() + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 6bb2eaf89b5d7..53e9c53efebf7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -235,11 +233,7 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.sum()) - expected["A"] = [0, 2, 4] - expected = expected.loc[:, ["A", "B"]] + result = g.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) def test_grouper_creation_bug2(self): @@ -779,10 +773,21 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. df = DataFrame({1: [], 2: []}) - g = df.groupby(1, group_keys=False) + g = df.groupby(1, group_keys=True) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) + def test_groupby_apply_empty_with_group_keys_false(self): + # 60471 + # test apply'ing empty groups with group_keys False + # (not testing other agg fns, because they return + # different index objects. + df = DataFrame({"A": [], "B": [], "C": []}) + g = df.groupby("A", group_keys=False) + result = g.apply(lambda x: x / x.sum()) + expected = DataFrame({"B": [], "C": []}, index=None) + tm.assert_frame_equal(result, expected) + def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 s = Series([], name="name", dtype="float64") @@ -807,7 +812,6 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -817,7 +821,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -864,9 +868,7 @@ def test_groupby_tuple_keys_handle_multiindex(self): } ) expected = df.sort_values(by=["category_tuple", "num1"]) - result = df.groupby("category_tuple").apply( - lambda x: x.sort_values(by="num1"), include_groups=False - ) + result = df.groupby("category_tuple").apply(lambda x: x.sort_values(by="num1")) expected = expected[result.columns] tm.assert_frame_equal(result.reset_index(drop=True), expected) @@ -981,12 +983,13 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) 
tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 41e00f8121b14..99a88a5d8fe7c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -28,7 +28,8 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": list("abc"), + "string": Series(["a", "b", "c"], dtype="str"), + "object": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -40,6 +41,7 @@ def df(self): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -112,6 +114,7 @@ def test_first_last(self, df, method): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError + exception = ( + (NotImplementedError, TypeError) if method.startswith("cum") else TypeError + ) if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError @@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): re.escape(f"agg function failed [how->{method},dtype->object]"), # cumsum/cummin/cummax/cumprod "function is not implemented for this dtype", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -180,7 +186,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -198,7 +204,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -238,6 +244,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): ("quantile", True), ("sem", True), ("skew", True), + ("kurt", True), ("std", True), ("sum", True), ("var", True), @@ -272,14 +279,11 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): kernel in ("first", "last") or ( # kernels that work on any dtype and don't have numeric_only arg - kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + kernel in ("any", "all", "bfill", "ffill", "nth", "nunique") and numeric_only is lib.no_default ) ): - warn = FutureWarning if kernel == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = method(*args, **kwargs) + result = method(*args, **kwargs) assert "b" in result.columns elif has_arg: assert numeric_only is not True @@ -299,7 +303,9 @@ def test_numeric_only(kernel, has_arg, 
numeric_only, keys): re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "idxmin": + if kernel == "quantile": + msg = "dtype 'object' does not support operation 'quantile'" + elif kernel == "idxmin": msg = "'<' not supported between instances of 'type' and 'type'" elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" @@ -373,13 +379,14 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "max", "prod", "skew", + "kurt", ) # Test default behavior; kernels that fail may be enabled in the future but kernels # that succeed should not be allowed to fail (without deprecation, at least) if groupby_func in fails_on_numeric_object and dtype is object: if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" + msg = "dtype 'object' does not support operation 'quantile'" else: msg = "is not supported for object dtype" with pytest.raises(TypeError, match=msg): @@ -402,6 +409,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "quantile", "sem", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 1044c83e3e56b..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -11,7 +8,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 @@ -39,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 38b4abfddda1e..ba13d3bd7278f 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -106,10 +104,9 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -147,7 +144,6 @@ def test_groupby_raises_string( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -169,12 +165,13 @@ def test_groupby_raises_string( TypeError, re.escape("agg function failed [how->prod,dtype->object]"), ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), + "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), "size": (None, ""), "skew": (ValueError, "could not convert string to float"), + 
"kurt": (ValueError, "could not convert string to float"), "std": (ValueError, "could not convert string to float"), "sum": (None, ""), "var": ( @@ -183,10 +180,39 @@ def test_groupby_raises_string( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if using_infer_string: + if groupby_func in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "kurt", + "quantile", + ]: + msg = f"dtype 'str' does not support operation '{groupby_func}'" + if groupby_func in ["sem", "std", "skew", "kurt"]: + # The object-dtype raises ValueError when trying to convert to numeric. + klass = TypeError + elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" + elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" + + elif groupby_func in ["cummin", "cummax"]: + msg = msg.replace("object", "str") + elif groupby_func == "corrwith": + msg = "Cannot perform reduction 'mean' with string dtype" + + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -211,7 +237,12 @@ def func(x): @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( - how, by, groupby_series, groupby_func_np, df_with_string_col + how, + by, + groupby_series, + groupby_func_np, + df_with_string_col, + using_infer_string, ): # GH#50749 df = df_with_string_col @@ -228,6 +259,15 @@ def test_groupby_raises_string_np( "Cannot perform reduction 'mean' with string dtype", ), }[groupby_func_np] + + if using_infer_string: + if groupby_func_np is np.mean: + klass = TypeError + msg = ( + f"Cannot perform reduction '{groupby_func_np.__name__}' " + "with string dtype" + ) + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @@ -259,7 +299,6 @@ def test_groupby_raises_datetime( "cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"), "diff": (None, ""), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -286,15 +325,21 @@ def test_groupby_raises_datetime( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'kurt'", + ] + ), + ), "std": (None, ""), "sum": (TypeError, "datetime64 type does not support operation 'sum"), "var": (TypeError, "datetime64 type does not support operation 'var'"), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -338,7 +383,7 @@ def test_groupby_raises_datetime_np( _call_and_check(klass, msg, how, gb, groupby_func_np, ()) -@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) +@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"]) def test_groupby_raises_timedelta(func): df = DataFrame( { @@ -415,7 
+460,6 @@ def test_groupby_raises_category( r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'", ), "ffill": (None, ""), - "fillna": (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -469,6 +513,15 @@ def test_groupby_raises_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "dtype category does not support operation 'kurt'", + "category type does not support kurt operations", + ] + ), + ), "std": ( TypeError, "|".join( @@ -490,10 +543,7 @@ def test_groupby_raises_category( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -608,7 +658,6 @@ def test_groupby_raises_category_on_category( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups @@ -647,6 +696,15 @@ def test_groupby_raises_category_on_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "category type does not support kurt operations", + "dtype category does not support operation 'kurt'", + ] + ), + ), "std": ( TypeError, "|".join( @@ -668,10 +726,7 @@ def test_groupby_raises_category_on_category( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index a6ea1502103c5..ea876cfdf4933 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -424,6 +422,239 @@ def test_mean_on_timedelta(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "values, dtype, result_dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64", "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]", "timedelta64[ns]"), + ( + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ], +) +def test_mean_skipna(values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype because + # Series.mean() changes the dtype to float64/object depending on the input dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: x.mean(skipna=skipna)) + .astype(result_dtype) + ) + result = df.groupby("cat")["val"].mean(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64"), + ([0, 1, np.nan, 3, 
4, 5, 6, 7, 8, 9], "Int64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]"), + ], +) +def test_sum_skipna(values, dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the original dtype because + # Series.sum() changes the dtype + expected = ( + df.groupby("cat")["val"].apply(lambda x: x.sum(skipna=skipna)).astype(dtype) + ) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +def test_sum_skipna_object(skipna): + # GH#15675 + df = DataFrame( + { + "val": ["a", "b", np.nan, "d", "e", "f", "g", "h", "i", "j"], + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + if skipna: + expected = Series( + ["aegi", "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + else: + expected = Series( + [np.nan, "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values, dtype, result_dtype", + [ + ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"), + ("prod", [0, -1, 3, 4, 5, np.nan, 6, 7, 8, 9], "Float64", "Float64"), + ("prod", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Int64", "Int64"), + ("prod", [np.nan] * 10, "float64", "float64"), + ("prod", [np.nan] * 10, "Float64", "Float64"), + ("prod", [np.nan] * 10, "Int64", "Int64"), + ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"), + ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Float64", "Float64"), + ("var", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Int64", "Float64"), + ("var", [np.nan] * 10, "float64", "float64"), + ("var", [np.nan] * 10, "Float64", "Float64"), + ("var", [np.nan] * 10, "Int64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"), + ("std", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Float64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Int64", "Float64"), + ("std", [np.nan] * 10, "float64", "float64"), + ("std", [np.nan] * 10, "Float64", "Float64"), + ("std", [np.nan] * 10, "Int64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("sem", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ("sem", [np.nan] * 10, "float64", "float64"), + ("sem", [np.nan] * 10, "Float64", "Float64"), + ("sem", [np.nan] * 10, "Int64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("min", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "min", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "min", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("min", [np.nan] * 10, "float64", "float64"), + ("min", [np.nan] * 10, "Float64", "Float64"), + ("min", [np.nan] * 10, "Int64", "Int64"), + ("max", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("max", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("max", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "max", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + 
"timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "max", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("max", [np.nan] * 10, "float64", "float64"), + ("max", [np.nan] * 10, "Float64", "Float64"), + ("max", [np.nan] * 10, "Int64", "Int64"), + ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ( + "median", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "median", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("median", [np.nan] * 10, "float64", "float64"), + ("median", [np.nan] * 10, "Float64", "Float64"), + ("median", [np.nan] * 10, "Int64", "Float64"), + ], +) +def test_multifunc_skipna(func, values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype as some operations + # change the dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: getattr(x, func)(skipna=skipna)) + .astype(result_dtype) + ) + result = getattr(df.groupby("cat")["val"], func)(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan @@ -470,8 +701,7 @@ def test_max_min_non_numeric(): assert "ss" in result -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_max_min_object_multiple_columns(): +def test_max_min_object_multiple_columns(using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -484,7 +714,7 @@ def test_max_min_object_multiple_columns(): } ) df._consolidate_inplace() # should already be consolidate, but double-check - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 3 if using_infer_string else 2 gb = df.groupby("A") @@ -1117,6 +1347,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): "median", "mean", "skew", + "kurt", "std", "var", "sem", @@ -1130,8 +1361,8 @@ def test_regression_allowlist_methods(op, skipna, sort): grouped = frame.groupby(level=0, sort=sort) - if op == "skew": - # skew has skipna + if op in ["skew", "kurt", "sum", "mean"]: + # skew, kurt, sum, mean have skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ee4973cbf18af..550efe9187fe8 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -76,6 +76,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + # TODO(infer_string) resample sum introduces 0's + # https://github.com/pandas-dev/pandas/issues/60229 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 @@ -481,12 +483,8 @@ 
def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -502,11 +500,8 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -932,9 +927,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj.index.nlevels == 1 # function that returns a Series - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x["Quantity"] * 2) + res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") expected = DataFrame( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5b8fa96291c9f..fecd20fd6cece 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -331,9 +329,6 @@ def test_transform_transformation_func(transformation_func): if transformation_func == "cumcount": test_op = lambda x: x.transform("cumcount") mock_op = lambda x: Series(range(len(x)), x.index) - elif transformation_func == "fillna": - test_op = lambda x: x.transform("fillna", value=0) - mock_op = lambda x: x.fillna(value=0) elif transformation_func == "ngroup": test_op = lambda x: x.transform("ngroup") counter = -1 @@ -536,15 +531,13 @@ def f(group): return group[:1] grouped = df.groupby("c") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert result["d"].dtype == np.float64 # this is by definition a mutating operation! 
for key, group in grouped: - res = f(group) + res = f(group.drop(columns="c")) tm.assert_frame_equal(res, result.loc[key]) @@ -690,18 +683,14 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): - warn = None - else: - warn = DeprecationWarning - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gb.apply(targop) + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": expected["string_missing"] = expected["string_missing"].fillna(np.nan) - expected["string"] = expected["string"].fillna(np.nan) + by = gb_target.get("by") + if not isinstance(by, (str, list)) or (by != "string" and "string" not in by): + expected["string"] = expected["string"].fillna(np.nan) result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -1034,20 +1023,19 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) @@ -1100,13 +1088,13 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): func = reduction_func obj = DataFrame( - {"a": [0, 0, 0, 1, 1, 1], "b": range(6)}, - index=["A", "B", "C", "D", "E", "F"], + {"a": [0, 0, 0, 0, 1, 1, 1, 1], "b": range(8)}, + index=["A", "B", "C", "D", "E", "F", "G", "H"], ) if frame_or_series is Series: obj = obj["a"] - g = obj.groupby(np.repeat([0, 1], 3)) + g = obj.groupby(np.repeat([0, 1], 4)) if func == "corrwith" and isinstance(obj, Series): # GH#32293 # TODO: implement SeriesGroupBy.corrwith @@ -1131,7 +1119,7 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): tm.assert_index_equal(result.columns, obj.columns) # verify that values were broadcasted across each group - assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + assert len(set(DataFrame(result).iloc[-4:, -1])) == 1 def test_transform_lambda_with_datetimetz(): @@ -1439,11 +1427,7 @@ def test_null_group_str_transformer_series(dropna, transformation_func): dtype = object if transformation_func in ("any", "all") else None buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - - warn = FutureWarning if transformation_func == "fillna" else None - msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(transformation_func, *args) + result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 
100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 787461b944bd0..5783a16e81d37 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -340,6 +340,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index f6b10c989326f..347d6b206e3b9 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd from pandas import ( DataFrame, @@ -16,6 +18,40 @@ def test_to_numpy(idx): tm.assert_numpy_array_equal(result, exp) +def test_array_interface(idx): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(idx) + expected = np.empty((6,), dtype=object) + expected[:] = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + tm.assert_numpy_array_equal(result, expected) + + # it always gives a copy by default, but the values are cached, so results + # are still sharing memory + result_copy1 = np.asarray(idx) + result_copy2 = np.asarray(idx) + assert np.may_share_memory(result_copy1, result_copy2) + + # with explicit copy=True, then it is an actual copy + result_copy1 = np.array(idx, copy=True) + result_copy2 = np.array(idx, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ return + + # for MultiIndex, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index e85091aaae608..f7544cf62e5fa 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( CategoricalIndex, @@ -754,13 +752,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 series1 = Series( (1,), index=MultiIndex.from_arrays( - [Series([None], dtype="string"), Series([None], dtype="string")] + [Series([None], dtype="str"), Series([None], dtype="str")] ), ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 89648bc316c16..2c5968314e5cf 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture): diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..648ee47ddc34c 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,53 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +def _equivalent_na(dtype, null): + if dtype.na_value is pd.NA and null is pd.NA: + return True + elif _isnan(dtype.na_value) and _isnan(null): + return True + else: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with 
pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +88,57 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 19b46d9b2c15f..608158d40cf23 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -8,12 +8,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import ( - HAS_PYARROW, - IS64, -) +from pandas.compat import IS64 from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -356,14 +351,11 
@@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "str" and not index.dtype.storage == "python": - # TODO(infer_string): Make the errors consistent - with pytest.raises(NotImplementedError, match="i8"): - index.view("i8") else: msg = ( - "Cannot change data-type for array of references.|" - "Cannot change data-type for object array.|" + r"Cannot change data-type for array of references\.|" + r"Cannot change data-type for object array\.|" + r"Cannot change data-type for array of strings\.|" ) with pytest.raises(TypeError, match=msg): index.view("i8") @@ -823,11 +815,6 @@ def test_isin(self, values, index, expected): expected = np.array(expected, dtype=bool) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_isin_nan_common_object( self, nulls_fixture, nulls_fixture2, using_infer_string ): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 5f934ca3e6e83..58b69d79c65ce 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -525,6 +525,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 36b08ee1df790..7aeded5a6cb7f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1230,7 +1230,7 @@ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful if using_infer_string: - with pytest.raises(TypeError, match="Must provide strings"): + with pytest.raises(TypeError, match="Invalid value"): result.loc[result.index, "A"] = [float(x) for x in col_data] else: result.loc[result.index, "A"] = [float(x) for x in col_data] @@ -3297,3 +3297,23 @@ def test_loc_reindexing_of_empty_index(self): df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0] expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) tm.assert_frame_equal(df, expected) + + def test_loc_setitem_matching_index(self): + # GH 25548 + s = Series(0.0, index=list("abcd")) + s1 = Series(1.0, index=list("ab")) + s2 = Series(2.0, index=list("xy")) + + # Test matching indices + s.loc[["a", "b"]] = s1 + + result = s[["a", "b"]] + expected = s1 + tm.assert_series_equal(result, expected) + + # Test unmatched indices + s.loc[["a", "b"]] = s2 + + result = s[["a", "b"]] + expected = Series([np.nan, np.nan], index=["a", "b"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 29ce9d0c03111..a41d7dec8b496 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -280,7 +278,7 @@ def test_empty_pyarrow(data): expected = pd.DataFrame(data) 
arrow_df = pa_from_dataframe(expected) result = from_dataframe(arrow_df) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) def test_multi_chunk_pyarrow() -> None: @@ -290,8 +288,7 @@ def test_multi_chunk_pyarrow() -> None: table = pa.table([n_legs], names=names) with pytest.raises( RuntimeError, - match="To join chunks a copy is required which is " - "forbidden by allow_copy=False", + match="Cannot do zero copy conversion into multi-column DataFrame block", ): pd.api.interchange.from_dataframe(table, allow_copy=False) @@ -401,7 +398,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) @@ -410,13 +406,12 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") result = pd.api.interchange.from_dataframe(df.__dataframe__()) - expected = pd.DataFrame({"a": ["x"]}, dtype="object") + expected = pd.DataFrame({"a": ["x"]}, dtype="str") tm.assert_frame_equal(result, expected) @@ -427,7 +422,6 @@ def test_non_str_names(): assert names == ["0"] -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) @@ -438,7 +432,7 @@ def test_non_str_names_w_duplicates(): "Expected a Series, got a DataFrame. This likely happened because you " "called __dataframe__ on a DataFrame which, after converting column " r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " - r"dtype='object'\). Please rename these columns before using the " + r"dtype='(str|object)'\). Please rename these columns before using the " "interchange protocol." ), ): @@ -646,3 +640,12 @@ def test_buffer_dtype_categorical( col = dfi.get_column_by_name("data") assert col.dtype == expected_dtype assert col.get_buffers()["data"][1] == expected_buffer_dtype + + +def test_from_dataframe_list_dtype(): + pa = pytest.importorskip("pyarrow", "14.0.0") + data = {"a": [[1, 2], [4, 5, 6]]} + tbl = pa.table(data) + result = from_dataframe(tbl) + expected = pd.DataFrame(data) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 591157bbe87fe..fc222f6987466 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -41,6 +41,20 @@ def test_namespace(): assert set(result) == set(expected + modules) +@pytest.mark.parametrize( + "name", + [ + "Block", + "ExtensionBlock", + ], +) +def test_deprecations(name): + # GH#55139 + msg = f"{name} is deprecated.* Use public APIs instead" + with tm.assert_produces_warning(FutureWarning, match=msg): + getattr(internals, name) + + def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") @@ -53,6 +67,18 @@ def test_make_block_2d_with_dti(): assert blk.values.shape == (1, 3) +def test_create_block_manager_from_blocks_deprecated(): + # GH#33892 + # If they must, downstream packages should get this from internals.api, + # not internals. 
+ msg = ( + "create_block_manager_from_blocks is deprecated and will be " + "removed in a future version. Use public APIs instead" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + internals.create_block_manager_from_blocks + + def test_create_dataframe_from_blocks(float_frame): block = float_frame._mgr.blocks[0] index = float_frame.index.copy() diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index bdefadf3dbec0..a5ddda9d66e7a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -67,14 +67,13 @@ def s3_base(worker_id, monkeypatch): monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): - # NOT RUN on Windows/macOS/ARM, only Ubuntu + # NOT RUN on Windows/macOS, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs - # - CircleCI will probably hit the Docker rate pull limit pytest.skip( - "S3 tests do not have a corresponding service in " - "Windows, macOS or ARM platforms" + "S3 tests do not have a corresponding service on " + "Windows or macOS platforms" ) else: # set in .github/workflows/unit-tests.yml diff --git a/pandas/tests/io/data/tar/test-csv.tar b/pandas/tests/io/data/tar/test-csv.tar new file mode 100644 index 0000000000000..c3b3091348426 Binary files /dev/null and b/pandas/tests/io/data/tar/test-csv.tar differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 3989e022dbbd2..34824f0a67985 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -625,7 +623,6 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): expected = DataFrame(expected) tm.assert_frame_equal(actual, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index f70e65e34c584..71ef1201e523f 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -9,6 +9,9 @@ from pandas import ( DataFrame, + MultiIndex, + Timestamp, + period_range, read_excel, ) import pandas._testing as tm @@ -333,3 +336,26 @@ def test_styler_to_s3(s3_public_bucket, s3so): f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so ) tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("merge_cells", [True, False, "columns"]) +def test_format_hierarchical_rows_periodindex(merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + formatter = ExcelFormatter(df, merge_cells=merge_cells) + formatted_cells = formatter._format_hierarchical_rows() + + for cell in formatted_cells: + if cell.row != 0 and cell.col == 0: + assert isinstance( + cell.val, Timestamp + ), "Period should be converted to Timestamp" diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 44266ae9a62a5..ced4feb9e7eb9 100644 --- a/pandas/tests/io/excel/test_writers.py +++ 
b/pandas/tests/io/excel/test_writers.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +from decimal import Decimal from functools import partial from io import BytesIO import os @@ -12,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -24,6 +23,7 @@ MultiIndex, date_range, option_context, + period_range, ) import pandas._testing as tm @@ -336,6 +336,43 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_cells", [True, False, "columns"]) + def test_excel_round_trip_with_periodindex(self, tmp_excel, merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + df.to_excel(tmp_excel, merge_cells=merge_cells) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) + expected = DataFrame( + {"A": [1, 2]}, + MultiIndex.from_arrays( + [ + [ + pd.to_datetime("2006-10-06 00:00:00"), + pd.to_datetime("2006-10-07 00:00:00"), + ], + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + time_format = ( + "datetime64[s]" if tmp_excel.endswith(".ods") else "datetime64[us]" + ) + expected.index = expected.index.set_levels( + expected.index.levels[0].astype(time_format), level=0 + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "engine,ext", @@ -763,6 +800,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string): # see gh-19242 # @@ -869,27 +909,49 @@ def test_to_excel_multiindex_nan_label(self, merge_cells, tmp_excel): # Test for Issue 11328. 
If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, frame, tmp_excel): + def test_to_excel_multiindex_cols(self, merge_cells, tmp_excel): + # GH#11328 + frame = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + "C": [7, 8, 9], + } + ) arrays = np.arange(len(frame.index) * 2, dtype=np.int64).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1)]) frame.columns = new_cols_index - header = [0, 1] - if not merge_cells: - header = 0 - - # round trip frame.to_excel(tmp_excel, sheet_name="test1", merge_cells=merge_cells) + + # Check round trip with ExcelFile(tmp_excel) as reader: - df = pd.read_excel( - reader, sheet_name="test1", header=header, index_col=[0, 1] + result = pd.read_excel( + reader, sheet_name="test1", header=[0, 1], index_col=[0, 1] ) + tm.assert_frame_equal(result, frame) + + # GH#60274 + # Check with header/index_col None to determine which cells were merged + with ExcelFile(tmp_excel) as reader: + result = pd.read_excel( + reader, sheet_name="test1", header=None, index_col=None + ) + expected = DataFrame( + { + 0: [np.nan, np.nan, "first", 0, 1, 2], + 1: [np.nan, np.nan, "second", 3, 4, 5], + 2: [40.0, 1.0, np.nan, 1.0, 2.0, 3.0], + 3: [np.nan, 2.0, np.nan, 4.0, 5.0, 6.0], + 4: [50.0, 1.0, np.nan, 7.0, 8.0, 9.0], + } + ) if not merge_cells: - fm = frame.columns._format_multi(sparsify=False, include_names=False) - frame.columns = [".".join(map(str, q)) for q in zip(*fm)] - tm.assert_frame_equal(frame, df) + # MultiIndex column value is repeated + expected.loc[0, 3] = 40.0 + tm.assert_frame_equal(result, expected) def test_to_excel_multiindex_dates(self, merge_cells, tmp_excel): # try multiindex with dates @@ -977,6 +1039,36 @@ def test_to_excel_float_format(self, tmp_excel): ) tm.assert_frame_equal(result, expected) + def test_to_excel_datatypes_preserved(self, tmp_excel): + # Test that when writing and reading Excel with dtype=object, + # datatypes are preserved, except Decimals which should be + # stored as floats + + # see gh-49598 + df = DataFrame( + [ + [1.23, "1.23", Decimal("1.23")], + [4.56, "4.56", Decimal("4.56")], + ], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(tmp_excel) + + with ExcelFile(tmp_excel) as reader: + result = pd.read_excel(reader, index_col=0, dtype=object) + + expected = DataFrame( + [ + [1.23, "1.23", 1.23], + [4.56, "4.56", 4.56], + ], + index=["A", "B"], + columns=["X", "Y", "Z"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + def test_to_excel_output_encoding(self, tmp_excel): # Avoid mixed inferred_type. 
df = DataFrame( @@ -1334,12 +1426,11 @@ def test_freeze_panes(self, tmp_excel): result = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_path_lib(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD")), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(30)]), ) writer = partial(df.to_excel, engine=engine) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index e9fc2b2d27afd..ff8a1b9f570ab 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -488,9 +488,11 @@ def test_repr_html_ok(self, styler): def test_repr_html_mathjax(self, styler): # gh-19824 / 41395 assert "tex2jax_ignore" not in styler._repr_html_() + assert "mathjax_ignore" not in styler._repr_html_() with option_context("styler.html.mathjax", False): assert "tex2jax_ignore" in styler._repr_html_() + assert "mathjax_ignore" in styler._repr_html_() def test_update_ctx(self, styler): styler._update_ctx(DataFrame({"A": ["color: red", "color: blue"]})) diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 1abe6238d3922..eb221686dd165 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, MultiIndex, @@ -731,7 +729,6 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", diff --git a/pandas/tests/io/formats/style/test_to_typst.py b/pandas/tests/io/formats/style/test_to_typst.py new file mode 100644 index 0000000000000..2365119c9c4dc --- /dev/null +++ b/pandas/tests/io/formats/style/test_to_typst.py @@ -0,0 +1,96 @@ +from textwrap import dedent + +import pytest + +from pandas import ( + DataFrame, + Series, +) + +pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler + + +@pytest.fixture +def df(): + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0, precision=2) + + +def test_basic_table(styler): + result = styler.to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + )""" + ) + assert result == expected + + +def test_concat(styler): + result = styler.concat(styler.data.agg(["sum"]).style).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830000], [abcd], + )""" + ) + assert result == expected + + +def test_concat_recursion(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2.concat(styler3)).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + 
) + assert result == expected + + +def test_concat_chain(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2).concat(styler3).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0dc16e1ebc723..86682e8160762 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -129,6 +129,20 @@ def test_repr_truncation_preserves_na(self): with option_context("display.max_rows", 2, "display.show_dimensions", False): assert repr(df) == " a\n0 \n.. ...\n9 " + def test_repr_truncation_dataframe_attrs(self): + # GH#60455 + df = DataFrame([[0] * 10]) + df.attrs["b"] = DataFrame([]) + with option_context("display.max_columns", 2, "display.show_dimensions", False): + assert repr(df) == " 0 ... 9\n0 0 ... 0" + + def test_repr_truncation_series_with_dataframe_attrs(self): + # GH#60568 + ser = Series([0] * 10) + ser.attrs["b"] = DataFrame([]) + with option_context("display.max_rows", 2, "display.show_dimensions", False): + assert repr(ser) == "0 0\n ..\n9 0\ndtype: int64" + def test_max_colwidth_negative_int_raises(self): # Deprecation enforced from: # https://github.com/pandas-dev/pandas/issues/31532 diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 1009dfec53218..3b63011bf862e 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -3,11 +3,33 @@ from collections.abc import Mapping import string +import pytest + import pandas._config.config as cf +import pandas as pd + from pandas.io.formats import printing +@pytest.mark.parametrize( + "input_names, expected_names", + [ + (["'a b"], "['\\'a b']"), # Escape leading quote + (["test's b"], "['test\\'s b']"), # Escape apostrophe + (["'test' b"], "['\\'test\\' b']"), # Escape surrounding quotes + (["test b'"], "['test b\\'']"), # Escape single quote + (["test\n' b"], "['test\\n\\' b']"), # Escape quotes, preserve newline + ], +) +def test_formatted_index_names(input_names, expected_names): + # GH#60190 + df = pd.DataFrame({name: [1, 2, 3] for name in input_names}).set_index(input_names) + formatted_names = str(df.index.names) + + assert formatted_names == expected_names + + def test_adjoin(): data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] expected = "a dd ggg\nb ee hhh\nc ff iii" diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 8031f67cd0567..b1a437bfdbd8a 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -934,9 +934,11 @@ def test_repr_html(self, float_frame): def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) assert "tex2jax_ignore" not in df._repr_html_() + assert "mathjax_ignore" not in df._repr_html_() with option_context("display.html.use_mathjax", False): assert "tex2jax_ignore" in df._repr_html_() + assert "mathjax_ignore" in df._repr_html_() def test_repr_html_wide(self): max_cols = 20 diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 1de53993fe646..8d46442611719 100644 --- 
a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -1405,3 +1405,88 @@ def test_to_latex_multiindex_multirow(self): """ ) assert result == expected + + def test_to_latex_multiindex_format_single_index_hidden(self): + # GH 52218 + df = DataFrame( + { + "A": [1, 2], + "B": [4, 5], + } + ) + result = ( + df.style.hide(axis="index") + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rr} + \textbf{A} & \textbf{B} \\ + 1 & 4 \\ + 2 & 5 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_two_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{lrrr} + & \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + Level 2 & & & \\ + x & 0 & 0 & 0 \\ + x & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_all_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1, 2]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rrr} + \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 5731f74a03852..1e8598c918efe 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -132,20 +132,17 @@ def test_to_string_with_formatters_unicode(self): ) assert result == expected - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = dedent( - """\ - 0 1 2 3 4 - a 0 1 2 3 4 - b 5 6 7 8 9 - c 10 11 12 13 14\ - """ - ) - assert rs == xp + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14""" + ) + assert rs == xp def test_no_extra_space(self): # GH#52690: Check that no extra space is given @@ -422,6 +419,24 @@ def test_to_string_complex_float_formatting(self): ) assert result == expected + def test_to_string_complex_float_formatting_with_exponents(self): + # GH #60393 + with option_context("display.precision", 6): + df = DataFrame( + { + "x": [ + (1.8816e-09 + 0j), + (1.8816e-09 + 3.39676e-09j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 1.881600e-09+0.000000e+00j\n" + "1 1.881600e-09+3.396760e-09j" + ) + assert result == expected + def 
test_to_string_format_inf(self): # GH#24861 df = DataFrame( diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index ff7d34c85c015..953a9246da1cd 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -41,6 +41,7 @@ def test_read_zipped_json(datapath): @td.skip_if_not_us_locale @pytest.mark.single_cpu +@pytest.mark.network def test_with_s3_url(compression, s3_public_bucket, s3so): # Bucket created in tests/io/conftest.py df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}')) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 7f367ded39863..7936982e4a055 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -27,10 +25,6 @@ set_default_names, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def df_schema(): @@ -127,7 +121,7 @@ def test_multiindex(self, df_schema, using_infer_string): expected["fields"][0] = { "name": "level_0", "type": "any", - "extDtype": "string", + "extDtype": "str", } expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 8de289afe9ff9..12ae24b064c9d 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -159,7 +159,7 @@ def test_build_decimal_series(self, dc): expected = OrderedDict( [ ("schema", schema), - ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ("data", [OrderedDict([("id", 0), ("a", "10")])]), ] ) @@ -245,7 +245,7 @@ def test_to_json(self, da, dc, sa, ia): [ ("idx", 0), ("A", "2021-10-10T00:00:00.000"), - ("B", 10.0), + ("B", "10"), ("C", "pandas"), ("D", 10), ] diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d3328d1dfcaef..5dc1272880c9b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,5 @@ import datetime from datetime import timedelta -from decimal import Decimal from io import StringIO import json import os @@ -84,7 +83,7 @@ def datetime_frame(self): # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) @@ -184,7 +183,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -270,7 +268,6 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, 
convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -698,7 +695,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -710,6 +706,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): if orient != "split": expected.name = None + if using_string_dtype(): + expected = expected.astype("str") + tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): @@ -808,7 +807,6 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -821,7 +819,6 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -912,7 +909,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -973,7 +969,6 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_date_unit(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") @@ -1114,7 +1109,6 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan) tm.assert_frame_equal(res, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -1418,6 +1412,7 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu + @pytest.mark.network @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so): # GH17200 @@ -1555,7 +1550,6 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @@ -2018,6 +2012,7 @@ def test_json_multiindex(self): assert result == expected @pytest.mark.single_cpu + @pytest.mark.network def test_to_s3(self, s3_public_bucket, s3so): # GH 28375 mock_bucket_name, target_file = s3_public_bucket.name, "test.json" @@ -2031,12 +2026,8 @@ def test_to_s3(self, s3_public_bucket, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.applymarker(mark) - expected_warning = None msg = ( "The 
default 'epoch' date format is deprecated and will be removed " diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 62118f1c82ebb..c5ccc3b3f7184 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -57,56 +57,56 @@ def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.ujson_dumps(sut, double_precision=15) decoded = ujson.ujson_loads(encoded) - assert decoded == 1337.1337 + assert decoded == "1337.1337" sut = decimal.Decimal("0.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "1.0" + assert encoded == '"0.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.95" sut = decimal.Decimal("0.94") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "0.9" + assert encoded == '"0.94"' decoded = ujson.ujson_loads(encoded) - assert decoded == 0.9 + assert decoded == "0.94" sut = decimal.Decimal("1.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "2.0" + assert encoded == '"1.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 2.0 + assert decoded == "1.95" sut = decimal.Decimal("-1.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "-2.0" + assert encoded == '"-1.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == -2.0 + assert decoded == "-1.95" sut = decimal.Decimal("0.995") encoded = ujson.ujson_dumps(sut, double_precision=2) - assert encoded == "1.0" + assert encoded == '"0.995"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.995" sut = decimal.Decimal("0.9995") encoded = ujson.ujson_dumps(sut, double_precision=3) - assert encoded == "1.0" + assert encoded == '"0.9995"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.9995" sut = decimal.Decimal("0.99999999999999944") encoded = ujson.ujson_dumps(sut, double_precision=15) - assert encoded == "1.0" + assert encoded == '"0.99999999999999944"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.99999999999999944" @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index a6504473fb55f..65ad7273666e5 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -231,8 +229,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -260,8 +257,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git 
a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 511db2c6a33d8..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index d8b8f24abcedd..cef57318195ec 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -15,8 +15,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.errors import ( EmptyDataError, @@ -71,14 +69,13 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 54b59ac4e25ed..cfa8785b24bde 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -9,8 +9,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -88,9 +86,10 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e02562ac8d93d..75b7cf0d42cb8 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas as pd @@ -57,7 +55,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = 
all_parsers @@ -71,7 +68,6 @@ def test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 9226f265ca2b3..11a30a26f91ef 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( @@ -184,8 +182,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -210,8 +207,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 0423327c7333c..c6ba2213033ea 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -188,7 +186,6 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 @@ -207,7 +204,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index ce2ed5e9764bd..9977e2b8e1a1d 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -345,7 +343,6 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 @@ -356,7 +353,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): pytest.mark.xfail(reason="Cannot disable 
type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 6a2ae3bffdc74..d3789cd387c05 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -8,9 +8,10 @@ import pytest -from pandas._config import using_string_dtype - -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -121,7 +122,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 @@ -133,7 +133,7 @@ def test_mangled_unnamed_placeholders(all_parsers): # This test recursively updates `df`. for i in range(3): - expected = DataFrame() + expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 89645b526f2ee..3a68d38cc0bde 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -261,7 +259,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -299,7 +296,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -317,8 +316,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) expected = DataFrame(expected) @@ -429,8 +429,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -438,14 +436,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # 
mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) @@ -536,7 +541,6 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_na_values_dict_null_column_name(all_parsers): # see gh-57547 parser = all_parsers @@ -560,11 +564,10 @@ def test_na_values_dict_null_column_name(all_parsers): return expected = DataFrame( - {None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]} + {"x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}, + index=Index(["MA", "NA", "OA"], dtype=object), ) - expected = expected.set_index(None) - result = parser.read_csv( StringIO(data), index_col=0, diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 532fcc5cd880c..1411ed5019766 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -421,7 +419,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -429,7 +426,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -609,7 +606,6 @@ def test_date_parser_usecols_thousands(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -642,7 +638,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. 
use dayfirst=True res5 = read_csv( @@ -752,7 +748,6 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -762,7 +757,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 01e576ba40f26..bc4c4c2e24e9c 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index d3b4bb0ea6c72..47658c0eb9012 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -795,7 +795,7 @@ def test_append_raise(setup_path): # series directly msg = re.escape( "cannot properly create the storer for: " - "[group->df,value->]" + "[group->df,value->]" ) with pytest.raises(TypeError, match=msg): store.append("df", Series(np.arange(10))) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 606b19ac0ed75..16c3c6798ff76 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -37,12 +37,11 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -91,10 +90,12 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -104,7 +105,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -163,7 +167,7 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * 
np.arange(120).reshape((30, 4)), @@ -178,8 +182,13 @@ def test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -194,7 +203,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -207,7 +216,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -218,7 +231,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -229,7 +246,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -308,6 +329,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -325,6 +347,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index a4257b54dd6db..66596f1138b96 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -26,7 +24,6 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -54,8 +51,8 @@ def test_api_default_format(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - 
index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -79,8 +76,8 @@ def test_api_default_format(tmp_path, setup_path): path = tmp_path / setup_path df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -106,7 +103,7 @@ def test_put(setup_path): ) df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) store["a"] = ts @@ -166,7 +163,7 @@ def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -183,7 +180,7 @@ def test_put_compression(setup_path): def test_put_compression_blosc(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -197,10 +194,20 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(store["c"], df) -def test_put_mixed_type(setup_path, performance_warning): +def test_put_datetime_ser(setup_path, performance_warning, using_infer_string): + # https://github.com/pandas-dev/pandas/pull/60663 + ser = Series(3 * [Timestamp("20010102").as_unit("ns")]) + with ensure_clean_store(setup_path) as store: + store.put("ser", ser) + expected = ser.copy() + result = store.get("ser") + tm.assert_series_equal(result, expected) + + +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" @@ -220,13 +227,42 @@ def test_put_mixed_type(setup_path, performance_warning): with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") - with tm.assert_produces_warning(performance_warning): + warning = None if using_infer_string else performance_warning + with tm.assert_produces_warning(warning): store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df) +def test_put_str_frame(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)}) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("df", df) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = df.astype(expected_dtype) + result = store.get("df") + tm.assert_frame_equal(result, expected) + + +def test_put_str_series(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + ser = Series(["x", pd.NA, "y"], dtype=dtype) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("ser", ser) + 
expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = ser.astype(expected_dtype) + result = store.get("ser") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( "index", @@ -253,7 +289,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -264,6 +300,12 @@ def test_column_multiindex(setup_path): expected = df.set_axis(df.index.to_numpy()) with ensure_clean_store(setup_path) as store: + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + store.put("df", df) + return store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index bbe1cd77e0d9f..03622faa2b5a8 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -19,7 +17,6 @@ class TestHDFStoreSubclass: # GH 33748 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 3f5b73f4aa8a4..a17cd27f8284e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat._constants import ( IS64, WASM, @@ -20,10 +18,6 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def dirpath(datapath): @@ -246,11 +240,13 @@ def test_zero_variables(datapath): pd.read_sas(fname) -def test_zero_rows(datapath): +@pytest.mark.parametrize("encoding", [None, "utf8"]) +def test_zero_rows(datapath, encoding): # GH 18198 fname = datapath("io", "sas", "data", "zero_rows.sas7bdat") - result = pd.read_sas(fname) - expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0] + result = pd.read_sas(fname, encoding=encoding) + str_value = b"a" if encoding is None else "a" + expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0] tm.assert_frame_equal(result, expected) @@ -409,7 +405,7 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") - df0 = pd.read_csv(fname, dtype="object") + df0 = pd.read_csv(fname, dtype="str") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 541cc39606047..b5e97314caf03 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( PyperclipException, 
PyperclipWindowsException, @@ -26,10 +24,6 @@ init_qt_clipboard, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def build_kwargs(sep, excel): kwargs = {} @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 if dtype_backend == "pyarrow": @@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 10e3af601b7ef..e162815271ab3 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,12 +19,12 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( WASM, is_platform_windows, ) +from pandas.compat.pyarrow import pa_version_under19p0 +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -139,7 +139,6 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -154,6 +153,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + if pa_version_under19p0: + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -337,7 +338,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -364,7 +364,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll @@ -437,14 +436,13 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. 
@@ -454,8 +452,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -472,8 +470,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"): @@ -503,15 +501,14 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -524,13 +521,12 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -555,8 +551,8 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): @@ -642,6 +638,19 @@ def close(self): handles.created_handles.append(TestError()) +@td.skip_if_no("fsspec", min_version="2023.1.0") +@pytest.mark.parametrize("compression", [None, "infer"]) +def test_read_csv_chained_url_no_error(compression): + # GH 60100 + tar_file_path = "pandas/tests/io/data/tar/test-csv.tar" + chained_file_url = f"tar://test.csv::file://{tar_file_path}" + + result = pd.read_csv(chained_file_url, compression=compression, sep=";") + expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}}) + + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( "reader", [ @@ -660,3 +669,17 @@ def test_pickle_reader(reader): # GH 22265 with BytesIO() as buffer: pickle.dump(reader, buffer) + + +@td.skip_if_no("pyarrow") +def test_pyarrow_read_csv_datetime_dtype(): + # GH 59904 + data = '"date"\n"20/12/2025"\n""\n"31/12/2020"' + result = pd.read_csv( + StringIO(data), parse_dates=["date"], dayfirst=True, dtype_backend="pyarrow" + ) + + expect_data = pd.to_datetime(["20/12/2025", pd.NaT, 
"31/12/2020"], dayfirst=True) + expect = pd.DataFrame({"date": expect_data}) + + tm.assert_frame_equal(expect, result) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 5eb202dd5aa24..fd1e9b4fdf211 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,8 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -139,7 +137,6 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. @@ -148,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 9721d045b7b91..e778193c147c1 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,10 +1,16 @@ """test feather-format compat""" +from datetime import datetime import zoneinfo import numpy as np import pytest +from pandas.compat.pyarrow import ( + pa_version_under18p0, + pa_version_under19p0, +) + import pandas as pd import pandas._testing as tm @@ -137,8 +143,8 @@ def test_rw_use_threads(self): def test_path_pathlib(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) @@ -236,14 +242,55 @@ def test_invalid_dtype_backend(self): with pytest.raises(ValueError, match=msg): read_feather(path, dtype_backend="numpy") - def test_string_inference(self, tmp_path): + def test_string_inference(self, tmp_path, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) ) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=dtype, + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), + ) 
+ tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") + def test_string_inference_string_view_type(self, tmp_path): + # GH#54798 + import pyarrow as pa + from pyarrow import feather + + path = tmp_path / "string_view.parquet" + table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) + feather.write_feather(table, path) + + with pd.option_context("future.infer_string", True): + result = read_feather(path) + + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) + + def test_out_of_bounds_datetime_to_feather(self): + # GH#47832 + df = pd.DataFrame( + { + "date": [ + datetime.fromisoformat("1654-01-01"), + datetime.fromisoformat("1920-01-01"), + ], + } + ) + self.check_round_trip(df) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index aa9c47ea0e63c..2e3e74a9d31ff 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,6 +5,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -176,7 +178,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -205,7 +209,6 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a9e7b2da03a4d..f68ef5fa2e0e5 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -158,7 +156,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension @@ -171,8 +168,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file @@ -208,7 +205,6 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 73e9933e3681b..bef28c4f027da 100644 --- a/pandas/tests/io/test_html.py +++ 
b/pandas/tests/io/test_html.py @@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html): tm.assert_frame_equal(result, expected) + def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html): + # GH60210 + + result = flavor_read_html( + StringIO( + """ + + + + + + + + + + + + +
                <table>
                    <tr>
                        <th rowspan="2">A</th>
                        <th>B</th>
                    </tr>
                    <tr>
                        <td>1</td>
                    </tr>
                    <tr>
                        <td>C</td>
                        <td>2</td>
                    </tr>
                </table>
+ """ + ) + )[0] + + expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index b11fe931f46e5..3b9c8769ad9dc 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -86,7 +86,6 @@ def stata_responder(df): return bio.getvalue() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "responder, read_method", [ @@ -107,6 +106,7 @@ def stata_responder(df): marks=[ td.skip_if_no("fastparquet"), td.skip_if_no("fsspec"), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string"), ], ), (pickle_respnder, pd.read_pickle), diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6ef7105cf5ccc..56a8e4c439164 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,6 +17,7 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, + pa_version_under19p0, ) import pandas as pd @@ -103,10 +104,7 @@ def fp(request): @pytest.fixture def df_compat(): - # TODO(infer_string) should this give str columns? - return pd.DataFrame( - {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) - ) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"])) @pytest.fixture @@ -254,8 +252,10 @@ def test_invalid_engine(df_compat): check_round_trip(df_compat, "foo", "bar") -def test_options_py(df_compat, pa): +def test_options_py(df_compat, pa, using_infer_string): # use the set option + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) @@ -683,7 +683,11 @@ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: httpserver.serve_content(content=f.read()) df = read_parquet(httpserver.url, engine=engine) - tm.assert_frame_equal(df, df_compat) + + expected = df_compat + if pa_version_under19p0: + expected.columns = expected.columns.astype(object) + tm.assert_frame_equal(df, expected) class TestParquetPyArrow(Base): @@ -784,18 +788,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame() - df["a"] = pd.Categorical(list("abcdef")) - - # test for null, out-of-order values, and unobserved category - df["b"] = pd.Categorical( - ["bar", "foo", "foo", "bar", None, "bar"], - dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), - ) - - # test for ordered flag - df["c"] = pd.Categorical( - ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + df = pd.DataFrame( + { + "a": pd.Categorical(list("abcdef")), + # test for null, out-of-order values, and unobserved category + "b": pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ), + # test for ordered flag + "c": pd.Categorical( + ["a", "b", "c", "a", "c", "b"], + categories=["b", "c", "d"], + ordered=True, + ), + } ) check_round_trip(df, pa) @@ -858,11 +865,13 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - def test_read_file_like_obj_support(self, df_compat): + def test_read_file_like_obj_support(self, df_compat, 
using_infer_string): pytest.importorskip("pyarrow") buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") tm.assert_frame_equal(df_compat, df_from_buf) def test_expand_user(self, df_compat, monkeypatch): @@ -929,7 +938,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - if using_infer_string: + if using_infer_string and pa_version_under19p0: check_round_trip(df, pa, expected=df.astype({"c": "str"})) else: check_round_trip(df, pa) @@ -943,7 +952,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): if using_infer_string: - expected = df.astype("str") + if pa_version_under19p0: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") expected.columns = expected.columns.astype("str") else: expected = df.astype(f"string[{string_storage}]") @@ -1099,17 +1111,24 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs - def test_string_inference(self, tmp_path, pa): + def test_string_inference(self, tmp_path, pa, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) - df.to_parquet(path, engine="pyarrow") + df.to_parquet(path, engine=pa) with pd.option_context("future.infer_string", True): - result = read_parquet(path, engine="pyarrow") + result = read_parquet(path, engine=pa) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.StringDtype(na_value=np.nan), - index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + dtype=dtype, + index=pd.Index(["a", "b"], dtype=dtype), + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), ) tm.assert_frame_equal(result, expected) @@ -1122,7 +1141,10 @@ def test_roundtrip_decimal(self, tmp_path, pa): df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + if pa_version_under19p0: + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + else: + expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) def test_infer_string_large_string_type(self, tmp_path, pa): @@ -1172,11 +1194,33 @@ def test_non_nanosecond_timestamps(self, temp_file): ) tm.assert_frame_equal(result, expected) + def test_maps_as_pydicts(self, pa): + pyarrow = pytest.importorskip("pyarrow", "13.0.0") + + schema = pyarrow.schema( + [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))] + ) + df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}]) + check_round_trip( + df, + pa, + write_kwargs={"schema": schema}, + read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}, + ) + class TestParquetFastParquet(Base): - @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") - def test_basic(self, fp, df_full): + def test_basic(self, fp, df_full, request): pytz = pytest.importorskip("pytz") + import fastparquet + + if Version(fastparquet.__version__) < 
Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=("datetime_with_nat gets incorrect values"), + ) + ) + tz = pytz.timezone("US/Eastern") df = df_full @@ -1213,11 +1257,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - @pytest.mark.xfail( - Version(np.__version__) >= Version("2.0.0"), - reason="fastparquet uses np.float_ in numpy2", - ) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1331,10 +1381,19 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.xfail( - reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929" - ) - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index c28a33069d23f..7e1220ecee218 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -60,7 +60,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), + pytest.mark.single_cpu, ] @@ -237,14 +237,17 @@ def types_table_metadata(dialect: str): "types", metadata, Column("TextCol", TEXT), - Column("DateCol", date_type), + # error: Cannot infer type argument 1 of "Column" + Column("DateCol", date_type), # type: ignore[misc] Column("IntDateCol", Integer), Column("IntDateOnlyCol", Integer), Column("FloatCol", Float), Column("IntCol", Integer), - Column("BoolCol", bool_type), + # error: Cannot infer type argument 1 of "Column" + Column("BoolCol", bool_type), # type: ignore[misc] Column("IntColWithNull", Integer), - Column("BoolColWithNull", bool_type), + # error: Cannot infer type argument 1 of "Column" + Column("BoolColWithNull", bool_type), # type: ignore[misc] ) return types @@ -682,6 +685,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql import dbapi @@ -814,6 +818,7 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture def sqlite_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi @@ -954,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ 
pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] @@ -983,13 +988,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": + if conn == "postgresql_adbc_conn" and not using_string_dtype(): request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver cannot insert index with null type", - strict=True, + reason="postgres ADBC driver < 1.2 cannot insert index with null type", ) ) + # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -3554,7 +3559,8 @@ def test_read_sql_dtype_backend( result = getattr(pd, func)( f"Select * from {table}", conn, dtype_backend=dtype_backend ) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3604,7 +3610,7 @@ def test_read_sql_dtype_backend_table( with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -4120,7 +4126,7 @@ def tquery(query, con=None): def test_xsqlite_basic(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 @@ -4147,7 +4153,7 @@ def test_xsqlite_basic(sqlite_buildin): def test_xsqlite_write_row_by_row(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) frame.iloc[0, 0] = np.nan @@ -4170,7 +4176,7 @@ def test_xsqlite_write_row_by_row(sqlite_buildin): def test_xsqlite_execute(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4191,7 +4197,7 @@ def test_xsqlite_execute(sqlite_buildin): def test_xsqlite_schema(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 8fa85d13bbdb5..9288b98d79fbe 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1719,7 +1719,6 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - # @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("byteorder", ["little", "big"]) def test_writer_117(self, byteorder, temp_file, using_infer_string): original = DataFrame( diff --git 
a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index b39f953da1ee6..d18f098267599 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -774,6 +774,16 @@ def test_bar_nan_stacked(self): expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] assert result == expected + def test_bar_stacked_label_position_with_zero_height(self): + # GH 59429 + df = DataFrame({"A": [3, 0, 1], "B": [0, 2, 4], "C": [5, 0, 2]}) + ax = df.plot.bar(stacked=True) + ax.bar_label(ax.containers[-1]) + expected = [8.0, 2.0, 7.0] + result = [text.xy[1] for text in ax.texts] + tm.assert_almost_equal(result, expected) + plt.close("all") + @pytest.mark.parametrize("idx", [Index, pd.CategoricalIndex]) def test_bar_categorical(self, idx): # GH 13019 @@ -1060,28 +1070,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1089,12 +1114,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, **kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) @@ -2579,6 +2614,14 @@ def test_plot_period_index_makes_no_right_shift(self, freq): result = 
ax.get_lines()[0].get_xdata() assert all(str(result[i]) == str(expected[i]) for i in range(4)) + def test_plot_display_xlabel_and_xticks(self): + # GH#44050 + df = DataFrame(np.random.default_rng(2).random((10, 2)), columns=["a", "b"]) + ax = df.plot.hexbin(x="a", y="b") + + _check_visible([ax.xaxis.get_label()], visible=True) + _check_visible(ax.get_xticklabels(), visible=True) + def _generate_4_axes_via_gridspec(): gs = mpl.gridspec.GridSpec(2, 2) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 4916963ab7c87..2267b6197cd80 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,7 @@ """Test cases for .boxplot method""" +from __future__ import annotations + import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,6 +38,17 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 @@ -312,7 +326,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -322,11 +336,11 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) @@ -335,13 +349,13 @@ def test_plot_box(self, vert): xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -351,11 +365,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -365,13 +379,19 @@ def test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for 
subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -379,9 +399,13 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 52ca66c218862..9675b936c171e 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -958,3 +958,16 @@ def test_plot_no_warning(self, ts): # TODO(3.0): this can be removed once Period[B] deprecation is enforced with tm.assert_produces_warning(False): _ = ts.plot() + + def test_secondary_y_subplot_axis_labels(self): + # GH#14102 + s1 = Series([5, 7, 6, 8, 7], index=[1, 2, 3, 4, 5]) + s2 = Series([6, 4, 5, 3, 4], index=[1, 2, 3, 4, 5]) + + ax = plt.subplot(2, 1, 1) + s1.plot(ax=ax) + s2.plot(ax=ax, secondary_y=True) + ax2 = plt.subplot(2, 1, 2) + s1.plot(ax=ax2) + assert len(ax.xaxis.get_minor_ticks()) == 0 + assert len(ax.get_xticklabels()) > 0 diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8153ba66d632b..a7bb80727206e 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -7,10 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( Categorical, @@ -1206,10 +1202,6 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) - # TODO(infer_string) implement argmin/max for python string dtype - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_idxminmax_object_frame(self): # GH#4279 df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]]) @@ -1615,17 +1607,10 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(["foo", np.nan], dtype=object) + def test_mode_sort_with_na(self): s = Series([1, "foo", "foo", np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, match="Unable to sort modes"): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - + expected = Series(["foo", np.nan], dtype=object) + result = s.mode(dropna=False) tm.assert_series_equal(result, expected) def test_mode_boolean_with_na(self): diff --git 
a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 179f2c0e6cfa9..3a7fd548ca961 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1022,12 +1022,8 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("ID").resample("5min").sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + result = df.groupby("ID").resample("5min").sum() + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1046,9 +1042,7 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1821,12 +1815,8 @@ def f(data, add_arg): multiplier = 10 df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index a8fb1b392322d..da1774cf22587 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -76,9 +76,7 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) @@ -187,7 +185,7 @@ def test_api_compat_before_use(attr): getattr(rs, attr) -def tests_raises_on_nuisance(test_frame): +def tests_raises_on_nuisance(test_frame, using_infer_string): df = test_frame df["D"] = "foo" r = df.resample("h") @@ -197,6 +195,8 @@ def tests_raises_on_nuisance(test_frame): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -881,7 +881,9 @@ def test_end_and_end_day_origin( ("sem", lib.no_default, "could not 
convert string to float"), ], ) -def test_frame_downsample_method(method, numeric_only, expected_data): +def test_frame_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -898,6 +900,11 @@ def test_frame_downsample_method(method, numeric_only, expected_data): if method in ("var", "mean", "median", "prod"): klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") + if using_infer_string: + msg = f"dtype 'str' does not support operation '{method}'" + elif method in ["sum", "std", "sem"] and using_infer_string: + klass = TypeError + msg = f"dtype 'str' does not support operation '{method}'" else: klass = ValueError msg = expected_data @@ -932,7 +939,9 @@ def test_frame_downsample_method(method, numeric_only, expected_data): ("last", lib.no_default, ["cat_2"]), ], ) -def test_series_downsample_method(method, numeric_only, expected_data): +def test_series_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -948,8 +957,11 @@ def test_series_downsample_method(method, numeric_only, expected_data): func(**kwargs) elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) + else: result = func(**kwargs) expected = Series(expected_data, index=expected_index) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index ff1b82210e20d..e7850f96b3b0f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -71,12 +71,8 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("id").apply(f_0) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.set_index("date").groupby("id").resample("D").asfreq() + expected = df.groupby("id").apply(f_0) + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -90,12 +86,8 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("group").apply(f_1) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + expected = df.groupby("group").apply(f_1) + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -110,9 +102,7 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.resample("2s").mean().B + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -236,12 
+226,8 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -258,12 +244,8 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -272,24 +254,18 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.resample("2s").sum() + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = r.apply(f_0) + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f_1) + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -357,9 +333,7 @@ def test_resample_groupby_with_label(unit): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("col0").resample("1W", label="left").sum() + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -369,9 +343,7 @@ def test_resample_groupby_with_label(unit): ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) - expected = DataFrame( - data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex - ) + expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex) tm.assert_frame_equal(result, expected) @@ -380,9 +352,7 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("2s").mean() + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -479,13 +449,12 @@ def 
test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) - .set_index(TimedeltaIndex([]), append=True) + .set_index(TimedeltaIndex([]), append=True)[expected_columns] ) if len(keys) == 1: expected.index.name = keys[0] @@ -505,9 +474,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["key"]).resample("W", on="date").min() + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -519,7 +486,6 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): ) expected = DataFrame( { - "key": ["A"] * 3 + ["B"] * 3, "col1": [0, 5, 12] * 2, "col_object": ["val"] * 3 + [np.nan] * 3, }, @@ -557,12 +523,11 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) - expected = expected.set_index("date", append=True, drop=True) + expected = expected.set_index("date", append=True, drop=True)[expected_columns] if len(keys) == 1: expected.index.name = keys[0] @@ -606,9 +571,7 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="h", periods=12), ) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("D").size() + result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( [ diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index f694b90a707c7..30e2c9dfe3d30 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -351,14 +351,11 @@ def test_groupby_resample_interpolate_raises(groupy_test_df): dfs = [groupy_test_df, groupy_test_df_without_index_name] for df in dfs: - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - with pytest.raises( - NotImplementedError, - match="Direct interpolation of MultiIndex data frames is " - "not supported", - ): - df.groupby("volume").resample("1D").interpolate(method="linear") + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex data frames is " "not supported", + ): + 
df.groupby("volume").resample("1D").interpolate(method="linear") def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): @@ -373,7 +370,6 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): for df in dfs: result = df.groupby("volume").apply( lambda x: x.resample("1D").interpolate(method="linear"), - include_groups=False, ) volume = [50] * 15 + [60] @@ -417,7 +413,7 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) See GH#21351.""" # GH#21351 result = groupy_test_df.groupby("volume").apply( - lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False + lambda x: x.resample("265h").interpolate(method="linear") ) volume = [50, 50, 60] diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 0f743332acbbe..65bfea0b9beea 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -620,7 +620,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - def test_mixed_type_join_with_suffix(self): + def test_mixed_type_join_with_suffix(self, using_infer_string): # GH #916 df = DataFrame( np.random.default_rng(2).standard_normal((20, 6)), @@ -631,6 +631,8 @@ def test_mixed_type_join_with_suffix(self): grouped = df.groupby("id") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d4766242b8460..f0abc1afc6ab0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1843,6 +1843,41 @@ def test_merge_empty(self, left_empty, how, exp): tm.assert_frame_equal(result, expected) + def test_merge_with_uintc_columns(self): + df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)}) + df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)}) + result = df1.merge(df2, how="outer") + expected = DataFrame( + { + "a": ["bar", "baz", "foo", "foo"], + "b": np.array([2, 4, 1, 3], dtype=np.uintc), + } + ) + tm.assert_frame_equal(result.reset_index(drop=True), expected) + + def test_merge_with_intc_columns(self): + df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.intc)}) + df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.intc)}) + result = df1.merge(df2, how="outer") + expected = DataFrame( + { + "a": ["bar", "baz", "foo", "foo"], + "b": np.array([2, 4, 1, 3], dtype=np.intc), + } + ) + tm.assert_frame_equal(result.reset_index(drop=True), expected) + + def test_merge_intc_non_monotonic(self): + df = DataFrame({"join_key": Series([0, 2, 1], dtype=np.intc)}) + df_details = DataFrame( + {"join_key": Series([0, 1, 2], dtype=np.intc), "value": ["a", "b", "c"]} + ) + merged = df.merge(df_details, on="join_key", how="left") + expected = DataFrame( + {"join_key": np.array([0, 2, 1], dtype=np.intc), "value": ["a", "c", "b"]} + ) + tm.assert_frame_equal(merged.reset_index(drop=True), expected) + @pytest.fixture def left(): diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index d8bb4fba1e1fe..63332fe4658e5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -733,6 +733,7 @@ def test_cut_with_duplicated_index_lowest_included(): 
tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da1930323f464..c7b7992a78232 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -364,7 +362,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -401,12 +398,14 @@ def test_with_prefix_contains_get_dummies_NaN_column(): ], ) def test_with_prefix_default_category( - dummies_with_unassigned, default_category, expected + dummies_with_unassigned, default_category, expected, using_infer_string ): result = from_dummies( dummies_with_unassigned, sep="_", default_category=default_category ) expected = DataFrame(expected) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index eccf676b87f89..374d236c8ff39 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -935,12 +935,14 @@ def test_margins(self, data): for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) - def test_no_col(self, data): + def test_no_col(self, data, using_infer_string): # no col # to help with a buglet data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -990,7 +992,7 @@ def test_no_col(self, data): ], ) def test_margin_with_only_columns_defined( - self, columns, aggfunc, values, expected_columns + self, columns, aggfunc, values, expected_columns, using_infer_string ): # GH 31016 df = DataFrame( @@ -1014,6 +1016,8 @@ def test_margin_with_only_columns_defined( ) if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: @@ -2372,9 +2376,13 @@ def test_pivot_table_with_margins_and_numeric_columns(self): tm.assert_frame_equal(result, expected) - def test_pivot_ea_dtype_dropna(self, dropna): + @pytest.mark.parametrize( + "dtype,expected_dtype", [("Int64", "Float64"), ("int64", "float64")] + ) + def test_pivot_ea_dtype_dropna(self, dropna, dtype, expected_dtype): # GH#47477 - df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype="Int64")}) + # GH#47971 + df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype=dtype)}) result = df.pivot_table( index="x", columns="y", values="age", aggfunc="mean", dropna=dropna ) @@ -2382,7 +2390,7 @@ def test_pivot_ea_dtype_dropna(self, dropna): [[30]], index=Index(["a"], name="x"), columns=Index(["b"], name="y"), - dtype="Float64", + dtype=expected_dtype, ) tm.assert_frame_equal(result, 
expected) @@ -2660,6 +2668,8 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() + # this still fails because columns=None gets passed down to unstack as level=None + # while at that point None was converted to NaN @pytest.mark.xfail( using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" ) @@ -2678,10 +2688,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2692,11 +2699,10 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 1d5d16f39e648..081feae6fc43f 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.concat import union_categoricals import pandas as pd @@ -124,12 +122,15 @@ def test_union_categoricals_nan(self): exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [[], ["1"]]) def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 if using_infer_string and val == ["1"]: - request.applymarker(pytest.mark.xfail("object and strings dont match")) + request.applymarker( + pytest.mark.xfail( + reason="TODO(infer_string) object and strings don't match" + ) + ) res = union_categoricals([Categorical([]), Categorical(val)]) exp = Categorical(val) tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 7aa6c6c0496a9..d65d425620c84 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -314,6 +314,17 @@ def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): assert result == expected + def test_timestamp_dst_transition(self): + # GH 60084 + dt_str = "2023-11-05 01:00-08:00" + tz_str = "America/Los_Angeles" + + ts1 = Timestamp(dt_str, tz=tz_str) + ts2 = ts1 + Timedelta(hours=0) + + assert ts1 == ts2 + assert hash(ts1) == hash(ts2) + class SubDatetime(datetime): pass diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 885adb3543b46..2c441a6ed91c1 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz from pandas.core.dtypes.common import ( @@
-556,7 +554,6 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_days(self): ser = Series(date_range("20130101", periods=5)) ser.iloc[0] = pd.NaT @@ -571,7 +568,6 @@ def test_strftime_dt64_days(self): expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], - dtype=np.object_, ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index c153e800cb534..bec8ca13a2f5f 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -25,9 +25,10 @@ def test_list_getitem(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]") + expected = Series([2, None, None], dtype="int64[pyarrow]", name="a") tm.assert_series_equal(actual, expected) @@ -37,9 +38,15 @@ def test_list_getitem_index(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + expected = Series( + [2, None, None], + dtype="int64[pyarrow]", + index=[1, 3, 7], + name="a", + ) tm.assert_series_equal(actual, expected) @@ -48,6 +55,7 @@ def test_list_getitem_slice(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) if pa_version_under11p0: with pytest.raises( @@ -60,6 +68,7 @@ def test_list_getitem_slice(): [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) tm.assert_series_equal(actual, expected) @@ -68,9 +77,10 @@ def test_list_len(): ser = Series( [[1, 2, 3], [4, None], None], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.len() - expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32()), name="a") tm.assert_series_equal(actual, expected) @@ -78,12 +88,14 @@ def test_list_flatten(): ser = Series( [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.flatten() expected = Series( [1, 2, 3, 4, None, 7, 8], dtype=ArrowDtype(pa.int64()), index=[0, 0, 0, 2, 2, 4, 4], + name="a", ) tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 9f310d8c8ab5f..d3556b644c4bf 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -251,18 +249,29 @@ def test_slice(string_series, object_series): tm.assert_series_equal(string_series, original) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + expected = Series( + Timedelta("1 days"), dtype="timedelta64[ns]", index=Index(["B"], dtype=object) + ) + tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) - 
tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + expected = Series( + [np.nan, Timedelta("1 days")], + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) + tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected = Series(Timedelta("1 days"), index=["A", "B"]) + expected = Series( + Timedelta("1 days"), + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 789e3ac752097..49c933c308235 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -9,12 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import ( - HAS_PYARROW, - WASM, -) from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -32,6 +26,7 @@ NaT, Period, Series, + StringDtype, Timedelta, Timestamp, array, @@ -535,17 +530,18 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_with_expansion_type_promotion(self): # GH#12599 ser = Series(dtype=object) ser["a"] = Timestamp("2016-01-01") ser["b"] = 3.0 ser["c"] = "foo" - expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + expected = Series( + [Timestamp("2016-01-01"), 3.0, "foo"], + index=Index(["a", "b", "c"], dtype=object), + ) tm.assert_series_equal(ser, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_not_contained(self, string_series): # set item that's not contained ser = string_series.copy() @@ -827,11 +823,6 @@ def test_mask_key(self, obj, key, expected, raises, val, indexer_sli): else: indexer_sli(obj)[mask] = val - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_series_where(self, obj, key, expected, raises, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -859,30 +850,20 @@ def test_series_where(self, obj, key, expected, raises, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_index_where(self, obj, key, expected, raises, val, using_infer_string): + def test_index_where(self, obj, key, expected, raises, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): - Index(obj).where(~mask, val) - else: - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_index_putmask(self, obj, key, expected, raises, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, raises, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): - Index(obj).putmask(mask, val) - else: - res = 
Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + res = Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( @@ -1373,6 +1354,19 @@ def raises(self): return False +@pytest.mark.parametrize( + "val,exp_dtype,raises", + [ + (1, object, True), + ("e", StringDtype(na_value=np.nan), False), + ], +) +class TestCoercionString(CoercionTest): + @pytest.fixture + def obj(self): + return Series(["a", "b", "c", "d"], dtype=StringDtype(na_value=np.nan)) + + @pytest.mark.parametrize( "val,exp_dtype,raises", [ @@ -1454,7 +1448,6 @@ def obj(self): np_version_gte1p24 and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" ) - or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 053c290999f2f..663ee8ad0ee38 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -231,7 +229,6 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment @@ -241,7 +238,7 @@ def test_where_setitem_invalid(): "different length than the value" ) # slice - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:3] = list(range(27)) @@ -251,18 +248,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:4:2] = list(range(27)) - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) s[0:4:2] = list(range(2)) expected = Series([0, "b", 1, "d", "e", "f"]) tm.assert_series_equal(s, expected) # neg slices - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[:-1] = list(range(27)) @@ -272,18 +269,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s, expected) # list - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) s[0] = list(range(10)) expected = Series([list(range(10)), "b", "c"]) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index c1082c06ce307..019efe8683347 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -66,7 +66,7 @@ def test_argsort_stable(self): tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) msg = ( r"ndarray Expected type , " - r"found instead" + r"found instead" ) with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(qindexer, mindexer) diff --git 
a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 097976b0a7ac0..88f2cf384fc79 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -7,10 +7,14 @@ from pandas._config import using_string_dtype -from pandas.compat import PYPY +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( CategoricalIndex, + Index, MultiIndex, Series, date_range, @@ -41,7 +45,9 @@ def test_info_categorical(): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): +def test_info_series( + lexsorted_two_level_string_multiindex, verbose, using_infer_string +): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") buf = StringIO() @@ -50,7 +56,7 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): expected = textwrap.dedent( """\ - + MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') """ ) @@ -63,10 +69,11 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): 10 non-null int64 """ ) + qualifier = "" if using_infer_string and HAS_PYARROW else "+" expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes + memory usage: {ser.memory_usage()}.0{qualifier} bytes """ ) assert result == expected @@ -80,7 +87,7 @@ def test_info_memory(): memory_bytes = float(s.memory_usage()) expected = textwrap.dedent( f"""\ - + RangeIndex: 2 entries, 0 to 1 Series name: None Non-Null Count Dtype @@ -142,14 +149,17 @@ def test_info_memory_usage_deep_pypy(): assert s_object.memory_usage(deep=True) == s_object.memory_usage() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index, plus", [ ([1, 2, 3], False), - (list("ABC"), True), + (Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)), + (Index(list("ABC"), dtype=object), True), (MultiIndex.from_product([range(3), range(3)]), False), - (MultiIndex.from_product([range(3), ["foo", "bar"]]), True), + ( + MultiIndex.from_product([range(3), ["foo", "bar"]]), + not (using_string_dtype() and HAS_PYARROW), + ), ], ) def test_info_memory_usage_qualified(index, plus): diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index e997ae32cf2e2..4f8484252ba8f 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -211,6 +211,30 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "dtype, data, values, expected", + [ + ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), + ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), + ("boolean", [pd.NA, False, True], [pd.NA, True, "a", 20], [True, False, True]), + ("boolean", [pd.NA, False, True], [], [False, False, False]), + ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]), + ], +) +def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch): + # https://github.com/pandas-dev/pandas/issues/60678 + # combination of large series (> _MINIMUM_COMP_ARR_LEN elements) and + # values contains pdNA + min_isin_comp = 2 + ser = Series(data, dtype=dtype) + expected = Series(expected, dtype="boolean") + + with monkeypatch.context() as m: + m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) + result = ser.isin(values) + 
tm.assert_series_equal(result, expected) + + def test_isin_complex_numbers(): # GH 17927 array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j] diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 068446a5e216b..442d73cadfe47 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -23,7 +23,7 @@ def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - assert np.may_share_memory(string_series.index, identity.index) + assert tm.shares_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 611fcc114db6c..abd5d075ea3d5 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -628,11 +626,17 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 - series = pd.Series(["0"]) + series = pd.Series(["0"], dtype=object) + expected = pd.Series([1], dtype=object) + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series_string(self, regex): + series = pd.Series(["0"], dtype="str") expected = pd.Series([1], dtype=object) result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) @@ -656,21 +660,18 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_replace_change_dtype_series(self, using_infer_string): + def test_replace_change_dtype_series(self): # GH#25797 - df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - warn = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warn, match="Downcasting"): - df["Test"] = df["Test"].replace([True], [np.nan]) - expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame({"Test": ["0.5", np.nan, "0.6"]}, dtype=object) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].replace([None], [np.nan]) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].fillna(np.nan) tm.assert_frame_equal(df, expected) @@ -707,3 +708,10 @@ def test_replace_ea_float_with_bool(self): expected = ser.copy() result = ser.replace(0.0, True) tm.assert_series_equal(result, expected) + + def test_replace_all_NA(self): + # GH#60688 + df = pd.Series([pd.NA, pd.NA]) + 
result = df.replace({r"^#": "$"}, regex=True) + expected = pd.Series([pd.NA, pd.NA]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 6eb7c74d2eca0..3e3eb36112680 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import Series import pandas._testing as tm @@ -26,7 +24,6 @@ def read_csv(self, path, **kwargs): return out - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_csv(self, datetime_series, string_series, temp_file): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 8c4f0ff3eaea7..f61e20c43657d 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -136,11 +134,10 @@ def test_unstack_mixed_type_name_in_multiindex( tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) mi = df.stack().index.rename(["major", "minor"]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 79a55eb357f87..4b369bb0bc869 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( DataFrame, @@ -164,12 +160,8 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_inspect_getmembers(self): # GH38782 - pytest.importorskip("jinja2") ser = Series(dtype=object) inspect.getmembers(ser) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46..89882d9d797c5 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -227,3 +229,55 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, 
pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(data, dtype=pyarrow_string_dtype) + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index ab083d5c58b35..eb81840f6f8f9 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -143,11 +141,13 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string): investigate failure" - ) - def test_newline(self): - ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + def test_newline(self, any_string_dtype): + ser = Series( + ["a\n\r\tb"], + name="a\n\r\td", + index=Index(["a\n\r\tf"], dtype=any_string_dtype), + dtype=any_string_dtype, + ) assert "\t" not in repr(ser) assert "\r" not in repr(ser) assert "a\n" not in repr(ser) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 8516018e8aa93..8f63819b09238 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -413,6 +413,7 @@ def test_logical_ops_label_based(self, using_infer_string): for e in [Series(["z"])]: if using_infer_string: # TODO(infer_string) should this behave differently? 
+ # -> https://github.com/pandas-dev/pandas/issues/60234 with pytest.raises( TypeError, match="not supported for dtype|unsupported operand type" ): diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 92b7b16da3c1f..5bcbb16da3be9 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -68,6 +68,7 @@ "get_dummies", "isalnum", "isalpha", + "isascii", "isdecimal", "isdigit", "islower", diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 34a6377b5786f..30e6ebf0eed13 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -293,23 +293,12 @@ def test_startswith_endswith_validate_na(any_string_dtype): dtype=any_string_dtype, ) - dtype = ser.dtype - if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): - msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.startswith("kapow", na="baz") - msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.endswith("bar", na="baz") - else: - # TODO(infer_string): don't surface pyarrow errors - import pyarrow as pa - - msg = "Could not convert 'baz' with type str: tried to convert to boolean" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.startswith("kapow", na="baz") - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.endswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 3b989e284ca25..16e10c6fcdccd 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -13,11 +11,6 @@ _testing as tm, ) -try: - import pyarrow as pa -except ImportError: - pa = None - def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -98,30 +91,12 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=str) - expected = DataFrame( - [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], - columns=list("abc"), - dtype=str, - ) - tm.assert_frame_equal(result, expected) + msg = "Only numeric or boolean dtypes are supported for 'dtype'" + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype=str) -# GH#47872 -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pa_str_dtype(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype="str[pyarrow]") - expected = DataFrame( - [ - ["true", "true", "false"], - ["true", "false", "true"], - ["false", "false", "false"], - ], - 
columns=list("abc"), - dtype="str[pyarrow]", - ) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype="datetime64[ns]") diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index cd3c512328139..c5414022e664b 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -83,6 +83,7 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec [ ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), + ("isascii", [True, None, True]), ("isalnum", [True, None, True]), ("isnumeric", [False, None, True]), ], diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 75a2007b61640..ee531b32aa82d 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) + empty_inferred_str = Series(dtype="str") if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) @@ -154,11 +155,12 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isascii()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) @@ -177,6 +179,7 @@ def test_empty_str_methods(any_string_dtype): @pytest.mark.parametrize( "method, expected", [ + ("isascii", [True, True, True, True, True, True, True, True, True, True]), ("isalnum", [True, True, True, True, True, False, True, True, False, False]), ("isalpha", [True, True, True, False, False, False, True, False, False, False]), ( @@ -564,7 +567,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -594,7 +597,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") tm.assert_series_equal(result, expected) @@ -749,5 +752,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index dac74a0e32a42..611b92eb022d6 100644 --- 
a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import ( algos as libalgos, hashtable as ht, @@ -1256,7 +1254,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts_internal(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") result_td = algos.value_counts_internal(td) tm.assert_series_equal(result_td, exp_td) @@ -1684,12 +1682,17 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1697,7 +1700,7 @@ class TestHashTable: ) def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1724,12 +1727,17 @@ def test_hashtable_unique(self, htable, data, writable): reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1737,7 +1745,7 @@ def test_hashtable_unique(self, htable, data, writable): ) def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e87498742061b..a23e6d9b3973a 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -295,6 +295,29 @@ def test_multiindex_insert_level_with_na(self, na): df[na, "B"] = 1 tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"])) + def test_multiindex_dt_with_nan(self): + # GH#60388 + df = DataFrame( + [ + [1, np.nan, 5, np.nan], + [2, np.nan, 6, np.nan], + [np.nan, 3, np.nan, 7], + [np.nan, 4, np.nan, 8], + ], + index=Series(["a", "b", "c", "d"], dtype=object, name="sub"), + columns=MultiIndex.from_product( + [ + ["value1", "value2"], + [datetime.datetime(2024, 11, 1), datetime.datetime(2024, 11, 2)], + ], + names=[None, "Date"], + ), + ) + df = df.reset_index() + result = df[df.columns[0]] + expected = Series(["a", "b", "c", "d"], 
name=("sub", np.nan)) + tm.assert_series_equal(result, expected) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ce41f1e76de79..e7ed8e855a762 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -537,11 +537,8 @@ def _argminmax_wrap(self, value, axis=None, func=None): nullnan = isna(nans) if res.ndim: res[nullnan] = -1 - elif ( - hasattr(nullnan, "all") - and nullnan.all() - or not hasattr(nullnan, "all") - and nullnan + elif (hasattr(nullnan, "all") and nullnan.all()) or ( + not hasattr(nullnan, "all") and nullnan ): res = -1 return res diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a9d3c235f63f6..74b051aec71a4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2084,6 +2084,18 @@ def test_dataframe_str_dtype(self, df, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_float32_dtype(self, df, cache): + # GH#60506 + # coerce to float64 + result = to_datetime(df.astype(np.float32), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) + tm.assert_series_equal(result, expected) + def test_dataframe_coerce(self, cache): # passing coerce df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) @@ -3668,3 +3680,12 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir to_datetime(vec, format="mixed") with pytest.raises(ValueError, match=msg): DatetimeIndex(vec) + + +def test_to_datetime_wrapped_datetime64_ps(): + # GH#60341 + result = to_datetime([np.datetime64(1901901901901, "ps")]) + expected = DatetimeIndex( + ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 943434e515828..809d8f87b2c02 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -8,12 +8,16 @@ tests, or when trying to pin down the bugs exposed by the tests below. 
""" +import zoneinfo + from hypothesis import ( assume, given, ) import pytest +from pandas.compat import WASM + import pandas as pd from pandas._testing._hypothesis import ( DATETIME_JAN_1_1900_OPTIONAL_TZ, @@ -28,6 +32,15 @@ @given(DATETIME_JAN_1_1900_OPTIONAL_TZ, YQM_OFFSET) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) + # This case is flaky in CI 2024-11-04 + assume( + not ( + WASM + and isinstance(dt.tzinfo, zoneinfo.ZoneInfo) + and dt.tzinfo.key == "Indian/Cocos" + and isinstance(offset, pd.offsets.MonthBegin) + ) + ) # check that the class-specific implementations of is_on_offset match # the general case definition: # (dt + offset) - offset == dt diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 8dd7060f21d59..60bbcf08ce8e7 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -144,7 +144,7 @@ def test_maybe_get_tz_invalid_types(): with pytest.raises(TypeError, match=""): timezones.maybe_get_tz(pytest) - msg = "" + msg = "" with pytest.raises(TypeError, match=msg): timezones.maybe_get_tz(Timestamp("2021-01-01", tz="UTC")) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 15eaa8c167487..877b50e37670c 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -177,6 +177,38 @@ def test_agg_nested_dicts(): r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) +@pytest.mark.parametrize( + "func,window_size", + [ + ( + "rolling", + 2, + ), + ( + "expanding", + None, + ), + ], +) +def test_pipe(func, window_size): + # Issue #57076 + df = DataFrame( + { + "B": np.random.default_rng(2).standard_normal(10), + "C": np.random.default_rng(2).standard_normal(10), + } + ) + r = getattr(df, func)(window_size) + + expected = r.max() - r.mean() + result = r.pipe(lambda x: x.max() - x.mean()) + tm.assert_frame_equal(result, expected) + + expected = r.max() - 2 * r.min() + result = r.pipe(lambda x, k: x.max() - k * x.min(), k=2) + tm.assert_frame_equal(result, expected) + + def test_count_nonnumeric_types(step): # GH12541 cols = [ diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index c60cb6ea74ec0..feb25a294c540 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -30,6 +30,8 @@ def _get_rolling_aggregations(): ("roll_median_c", window_aggregations.roll_median_c), ("roll_max", window_aggregations.roll_max), ("roll_min", window_aggregations.roll_min), + ("roll_first", window_aggregations.roll_first), + ("roll_last", window_aggregations.roll_last), ] + [ ( diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b2f76bdd0e2ad..39cedc3b692da 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -451,6 +451,8 @@ def test_moment_functions_zero_length_pairwise(f): lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), lambda x: x.expanding(min_periods=5).max(), lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).first(), + lambda x: x.expanding(min_periods=5).last(), lambda x: x.expanding(min_periods=5).sum(), lambda x: x.expanding(min_periods=5).mean(), lambda x: x.expanding(min_periods=5).std(), @@ -596,6 +598,104 @@ def test_expanding_corr_pairwise_diff_length(): tm.assert_frame_equal(result4, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + 
( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 5.0, + 5.0, + 7.0, + 7.0, + 9.0, + 9.0, + ], + ), + ], +) +def test_expanding_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0] * 10, + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_expanding_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_expanding_apply_args_kwargs(engine_and_raw): def mean_w_arg(x, const): return np.mean(x) + const diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 4d37c6d57f788..392239b8adadd 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -6,6 +6,7 @@ DatetimeIndex, Index, MultiIndex, + NamedAgg, Series, Timestamp, date_range, @@ -90,6 +91,8 @@ def test_getitem_multiple(self, roll_frame): "mean", "min", "max", + "first", + "last", "count", "kurt", "skew", @@ -100,11 +103,7 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -116,11 +115,7 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = 
g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -134,13 +129,9 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -181,9 +172,7 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -199,9 +188,7 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -246,11 +233,7 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -489,6 +472,36 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .rolling(2) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [np.nan, 17.8, 19.9, np.nan, 205.5, 240.0], + "min_height": [np.nan, 9.1, 9.5, np.nan, 6.0, 8.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( @@ -795,13 +808,9 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).sum()).index + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.rolling(4).sum()).index + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -977,13 +986,11 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -1002,13 +1009,9 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("B") - .groupby("A") - .apply(lambda x: x.rolling("4s")["C"].mean()) - ) + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1031,18 +1034,14 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"] ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1054,11 +1053,7 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1072,13 +1067,9 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: 
x.expanding().quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1094,9 +1085,7 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_0) + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1111,9 +1100,7 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_1) + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1122,18 +1109,42 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) + def test_groupby_expanding_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .expanding(1) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [7.9, 17.8, 27.8, 7.5, 205.5, 247.5], + "min_height": [9.1, 9.1, 9.1, 6.0, 6.0, 6.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + class TestEWM: @pytest.mark.parametrize( @@ -1162,6 +1173,41 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) + def test_groupby_ewm_agg_namedagg(self): + # GH#28333 + df = DataFrame({"A": ["a"] * 4, "B": range(4)}) + result = ( + df.groupby("A") + .ewm(com=1.0) + .agg( + B_mean=NamedAgg(column="B", aggfunc="mean"), + B_std=NamedAgg(column="B", aggfunc="std"), + B_var=NamedAgg(column="B", aggfunc="var"), + ) + ) + expected = DataFrame( + { + "B_mean": [ + 0.0, + 0.6666666666666666, + 1.4285714285714286, + 2.2666666666666666, + ], + "B_std": [np.nan, 0.707107, 0.963624, 1.177164], + "B_var": [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857], + }, + index=MultiIndex.from_tuples( + [ + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ], + names=["A", None], + ), + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "method, expected_data", [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]], diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index d9ab4723a8f2c..120dbe788a23f 100644 --- a/pandas/tests/window/test_numba.py 
+++ b/pandas/tests/window/test_numba.py @@ -459,6 +459,38 @@ def f(x): ) tm.assert_frame_equal(result, expected) + def test_table_method_rolling_apply_col_order(self): + # GH#59666 + def f(x): + return np.nanmean(x[:, 0] - x[:, 1]) + + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [6, 7, 8, 5, 6, 7], + } + ) + result = df.rolling(3, method="table", min_periods=0)[["a", "b"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "a": [-5, -5, -5, -3.66667, -2.33333, -1], + "b": [-5, -5, -5, -3.66667, -2.33333, -1], + } + ) + tm.assert_almost_equal(result, expected) + result = df.rolling(3, method="table", min_periods=0)[["b", "a"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "b": [5, 5, 5, 3.66667, 2.33333, 1], + "a": [5, 5, 5, 3.66667, 2.33333, 1], + } + ) + tm.assert_almost_equal(result, expected) + def test_table_method_rolling_weighted_mean(self, step): def weighted_mean(x): arr = np.ones((1, x.shape[1])) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index af3194b5085c4..2aaa35ec5ec2c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1326,6 +1326,82 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [float("nan")] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [float("nan")] * 10, + ), + ], +) +def test_rolling_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0, 1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_rolling_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index f77a98ae9a7d9..6820ab7332975 100644 --- 
a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -340,6 +340,8 @@ def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), @@ -501,6 +503,8 @@ def test_rolling_min_max_numeric_types(any_real_numpy_dtype): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index eacdaddfa28b0..043f369566a5d 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -541,6 +541,42 @@ def test_ragged_max(self, ragged): expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) + def test_ragged_first(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_ragged_last(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "freq, op, result_data", [ @@ -586,6 +622,8 @@ def test_freqs_ops(self, freq, op, result_data): "skew", "min", "max", + "first", + "last", ], ) def test_all(self, f, regular): diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py index e361726dc6f80..c00843ecac418 100644 --- a/pandas/tseries/__init__.py +++ b/pandas/tseries/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index ec2d7d2304839..5ea899f1610a7 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -7,4 +7,4 @@ from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", "offsets", "guess_datetime_format"] +__all__ = ["guess_datetime_format", "infer_freq", "offsets"] diff --git 
a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 534bee5fede44..9a01568971af8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -89,6 +89,11 @@ def infer_freq( """ Infer the most likely frequency given the input index. + This method attempts to deduce the most probable frequency (e.g., 'D' for daily, + 'H' for hourly) from a sequence of datetime-like objects. It is particularly useful + when the frequency of a time series is not explicitly set or known but can be + inferred from its values. + Parameters ---------- index : DatetimeIndex, TimedeltaIndex, Series or array-like @@ -106,6 +111,13 @@ def infer_freq( ValueError If there are fewer than three values. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Return a fixed frequency TimedeltaIndex with day as the default. + period_range : Return a fixed frequency PeriodIndex. + DatetimeIndex.freq : Return the frequency object if it is set, otherwise None. + Examples -------- >>> idx = pd.date_range(start="2020/12/01", end="2020/12/30", periods=30) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index b9ef557cb1d15..c95b9e96b1d44 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -634,12 +634,17 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal __all__ = [ + "FR", + "MO", + "SA", + "SU", + "TH", + "TU", + "WE", + "HolidayCalendarFactory", "after_nearest_workday", "before_nearest_workday", - "FR", "get_calendar", - "HolidayCalendarFactory", - "MO", "nearest_workday", "next_monday", "next_monday_or_tuesday", @@ -647,11 +652,6 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal "previous_friday", "previous_workday", "register", - "SA", - "SU", "sunday_to_monday", - "TH", - "TU", - "WE", "weekend_to_monday", ] diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 169c9cc18a7fd..a065137e6971c 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -46,46 +46,46 @@ ) __all__ = [ - "Day", + "FY5253", + "BDay", + "BMonthBegin", + "BMonthEnd", + "BQuarterBegin", + "BQuarterEnd", + "BYearBegin", + "BYearEnd", "BaseOffset", "BusinessDay", + "BusinessHour", "BusinessMonthBegin", "BusinessMonthEnd", - "BDay", + "CBMonthBegin", + "CBMonthEnd", + "CDay", "CustomBusinessDay", + "CustomBusinessHour", "CustomBusinessMonthBegin", "CustomBusinessMonthEnd", - "CDay", - "CBMonthEnd", - "CBMonthBegin", + "DateOffset", + "Day", + "Easter", + "FY5253Quarter", + "Hour", + "LastWeekOfMonth", + "Micro", + "Milli", + "Minute", "MonthBegin", - "BMonthBegin", "MonthEnd", - "BMonthEnd", - "SemiMonthEnd", - "SemiMonthBegin", - "BusinessHour", - "CustomBusinessHour", - "YearBegin", - "BYearBegin", - "YearEnd", - "BYearEnd", + "Nano", "QuarterBegin", - "BQuarterBegin", "QuarterEnd", - "BQuarterEnd", - "LastWeekOfMonth", - "FY5253Quarter", - "FY5253", + "Second", + "SemiMonthBegin", + "SemiMonthEnd", + "Tick", "Week", "WeekOfMonth", - "Easter", - "Tick", - "Hour", - "Minute", - "Second", - "Milli", - "Micro", - "Nano", - "DateOffset", + "YearBegin", + "YearEnd", ] diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 165824bec131f..a1a0d51a7c72b 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -83,7 +83,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if alternative.__doc__.count("\n") < 3: raise AssertionError(doc_error_msg) empty1, summary, empty2, doc_string = 
alternative.__doc__.split("\n", 3) - if empty1 or empty2 and not summary: + if empty1 or (empty2 and not summary): raise AssertionError(doc_error_msg) wrapper.__doc__ = dedent( f""" @@ -497,13 +497,13 @@ def indent(text: str | None, indents: int = 1) -> str: __all__ = [ "Appender", + "Substitution", "cache_readonly", "deprecate", "deprecate_kwarg", "deprecate_nonkeyword_arguments", "doc", "future_version_msg", - "Substitution", ] diff --git a/pyproject.toml b/pyproject.toml index 6dfee8f4910db..7ab9cd2c17669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,7 +160,13 @@ free-threaded-support = true before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] -before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" +before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh" +before-test = "bash {package}/scripts/cibw_before_test_windows.sh" +test-command = """ + set PANDAS_CI='1' && \ + python -c "import pandas as pd; \ + pd.test(extra_args=['--no-strict-data-files', '-m not clipboard and not single_cpu and not slow and not network and not db']);" \ + """ repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" [[tool.cibuildwheel.overrides]] @@ -175,13 +181,6 @@ test-command = """ select = "*-musllinux*" before-test = "apk update && apk add musl-locales" -[[tool.cibuildwheel.overrides]] -select = "*-win*" -# We test separately for Windows, since we use -# the windowsservercore docker image to check if any dlls are -# missing from the wheel -test-command = "" - [[tool.cibuildwheel.overrides]] # Don't strip wheels on macOS. # macOS doesn't support stripping wheels with linker @@ -305,10 +304,6 @@ ignore = [ "PERF102", # try-except-in-loop, becomes useless in Python 3.11 "PERF203", - # pytest-missing-fixture-name-underscore - "PT004", - # pytest-incorrect-fixture-name-underscore - "PT005", # pytest-parametrize-names-wrong-type "PT006", # pytest-parametrize-values-wrong-type diff --git a/requirements-dev.txt b/requirements-dev.txt index 00e320e6370ce..fb4d9cdb589ca 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -24,6 +24,7 @@ html5lib>=1.1 hypothesis>=6.84.0 gcsfs>=2022.11.0 ipython +pickleshare jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 @@ -54,7 +55,7 @@ moto flask asv>=0.6.1 flake8==7.1.0 -mypy==1.11.2 +mypy==1.13.0 tokenize-rt pre-commit>=4.0.1 gitpython @@ -62,7 +63,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme==0.14 +pydata-sphinx-theme==0.16 pytest-cython sphinx sphinx-design diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 679b91e3280ec..4cdbf8db0ba89 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -5,8 +5,8 @@ done # TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. 
FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then +if [[ $FREE_THREADED_BUILD == "True" ]]; then python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython - python -m pip install ninja meson-python versioneer[toml] + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install numpy ninja meson-python versioneer[toml] fi diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh new file mode 100644 index 0000000000000..5153ebd691f3b --- /dev/null +++ b/scripts/cibw_before_build_windows.sh @@ -0,0 +1,13 @@ +# Add 3rd party licenses, like numpy does +for file in $PACKAGE_DIR/LICENSES/*; do + cat $file >> $PACKAGE_DIR/LICENSE +done + +# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13 +# and a NumPy Windows wheel for the free-threaded build on PyPI. +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then + python -m pip install -U pip + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython + python -m pip install ninja meson-python versioneer[toml] +fi diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh new file mode 100644 index 0000000000000..dd02bc23dd5a1 --- /dev/null +++ b/scripts/cibw_before_test_windows.sh @@ -0,0 +1,5 @@ +# TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI. +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy +fi diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 55acfaac4d843..944575dcc8659 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -45,6 +45,7 @@ "Styler.template_html_style", "Styler.template_html_table", "Styler.template_latex", + "Styler.template_typst", "Styler.template_string", "Styler.loader", "errors.InvalidComparison", diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 076acc359f933..d804e15f6d48f 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -319,10 +319,10 @@ def nodefault_used_not_only_for_typing(file_obj: IO[str]) -> Iterable[tuple[int, while nodes: in_annotation, node = nodes.pop() if not in_annotation and ( - isinstance(node, ast.Name) # Case `NoDefault` - and node.id == "NoDefault" - or isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` - and node.attr == "NoDefault" + (isinstance(node, ast.Name) # Case `NoDefault` + and node.id == "NoDefault") + or (isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` + and node.attr == "NoDefault") ): yield (node.lineno, "NoDefault is used not only for typing") diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 4c66f28818abd..c26b093b0c4ba 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -73,12 +73,12 @@
  • diff --git a/web/pandas/about/citing.md b/web/pandas/about/citing.md index 4ce1fdb207865..a3c470d05e55f 100644 --- a/web/pandas/about/citing.md +++ b/web/pandas/about/citing.md @@ -20,7 +20,7 @@ following paper: url = {https://doi.org/10.5281/zenodo.3509134} } -- [Data structures for statistical computing in python](https://conference.scipy.org/proceedings/scipy2010/pdfs/mckinney.pdf), +- [Data structures for statistical computing in python](https://pub.curvenote.com/01908378-3686-7168-a380-d82bbf21c799/public/mckinney-57fc0d4e8a08cd7f26a4b8bf468a71f4.pdf), McKinney, Proceedings of the 9th Python in Science Conference, Volume 445, 2010. @InProceedings{ mckinney-proc-scipy-2010, diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 6c69ff7602491..dc7b9bc947214 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -468,6 +468,31 @@ df.dtypes ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). +### [Hugging Face](https://huggingface.co/datasets) + +The Hugging Face Dataset Hub provides a large collection of ready-to-use datasets for machine learning shared by the community. The platform offers a user-friendly interface to explore, discover and visualize datasets, and provides tools to easily load and work with these datasets in Python thanks to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library. + +You can access datasets on Hugging Face using `hf://` paths in pandas, in the form `hf://datasets/username/dataset_name/...`. + +For example, here is how to load the [stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb): + +```python +import pandas as pd + +# Load the IMDB dataset +df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") +``` + +Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas. + +To save a dataset on Hugging Face you need to [create a public or private dataset](https://huggingface.co/new-dataset) and [login](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), and then you can use `df.to_csv/to_json/to_parquet`: + +```python +# Save the dataset to my Hugging Face account +df.to_parquet("hf://datasets/username/dataset_name/train.parquet") +``` + +You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets). ## Out-of-core diff --git a/web/pandas/index.html b/web/pandas/index.html index 63bc11d3ed5d8..98628b856edb6 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -83,8 +83,8 @@

  • diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md index d586c46e243f8..0c3bf3c776988 100644 --- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md +++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md @@ -185,7 +185,6 @@ Additionally, if a user is installing pandas in an environment where wheels are the user will need to also build Arrow C++ and related dependencies when installing from source. These environments include - Alpine linux (commonly used as a base for Docker containers) -- WASM (pyodide and pyscript) - Python development versions Lastly, pandas development and releases will need to be mindful of PyArrow's development and release cadance. For example when diff --git a/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md new file mode 100644 index 0000000000000..b8eba90f399c9 --- /dev/null +++ b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md @@ -0,0 +1,74 @@ +# PDEP-17: Backwards compatibility and deprecation policy + +- Created: 27 June 2024 +- Status: Accepted +- Discussion: [#59125](https://github.com/pandas-dev/pandas/issues/59125) +- Author: [Abdulaziz Aloqeely](https://github.com/Aloqeely) +- Revision: 1 + +## Abstract + +This PDEP defines pandas' backwards compatibility and deprecation policy. + +The main additions to [pandas' current version policy](https://pandas.pydata.org/pandas-docs/version/2.2/development/policies.html) are: +- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed. +- Deprecations should initially use DeprecationWarning, and then be switched to FutureWarning in the last minor release before the major release they are planned to be removed in + +## Motivation + +Having a clear backwards compatibility and deprecation policy is crucial to having a healthy ecosystem. We want to ensure users can rely on pandas being stable while still allowing the library to evolve. + +This policy will ensure that users have enough time to deal with deprecations while also minimizing disruptions on downstream packages' users. + +## Scope + +This PDEP covers pandas' approach to backwards compatibility and the deprecation and removal process. + +## Background + +pandas uses a loose variant of semantic versioning. +A pandas release number is written in the format of ``MAJOR.MINOR.PATCH``. + +## General policy + +This policy applies to the [public API][1]. Anything not part of the [public API][1] or is marked as "Experimental" may be changed or removed at anytime. + +- Breaking backwards compatibility should benefit more than it harms users. +- Breaking changes should go through a deprecation cycle before being implemented if possible. +- Breaking changes should only occur in major releases. +- No deprecations should be introduced in patch releases. +- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed. + +Some bug fixes may require breaking backwards compatibility. In these cases, a deprecation cycle is not necessary. However, bug fixes which have a large impact on users might be treated as a breaking change. Whether or not a change is a bug fix or an API breaking change is a judgement call. 
+ +## Deprecation process + +Deprecation provides a way to warn developers and give them time to adapt their code to the new functionality before the old behavior is eventually removed. + +A deprecation's warning message should: +- Provide information on what is changing. +- Mention how to achieve similar behavior if an alternative is available. +- For large-scale deprecations, it is recommended to include a reason for the deprecation, alongside a discussion link to get user feedback. + +Additionally, when one introduces a deprecation, they should: +- Use the appropriate warning class. More info on this can be found below. +- Add the GitHub issue/PR number as a comment above the warning line. +- Add an entry in the release notes. +- Mention that the functionality is deprecated in the documentation using the ``.. deprecated::`` directive. + +### Which warning class to use + +Deprecations should initially use ``DeprecationWarning``, and then be switched to ``FutureWarning`` for broader visibility in the last minor release before the major release they are planned to be removed in. +This implementation detail can be ignored by using the appropriate ``PandasDeprecationWarning`` variable, which will be aliased to the proper warning class based on the pandas version. + +### Enforcement of deprecations + +When one enforces a deprecation, they should: +- Add an entry in the release notes. +- For API changes, replace the ``.. deprecated::`` directive in the documentation with a ``.. versionchanged::`` directive. + +### PDEP-17 History + +- 27 June 2024: Initial version. + +[1]: https://pandas.pydata.org/docs/reference/index.html
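To make the warning-class guidance in PDEP-17 concrete, here is a minimal sketch of what a policy-compliant deprecation could look like. This is an illustrative assumption, not pandas' actual implementation: the alias assignment, the `old_behavior` function, and the issue number are all hypothetical; PDEP-17 only states that an appropriate ``PandasDeprecationWarning`` variable will be aliased to the right class for the current release.

```python
# Hedged sketch (assumptions, not the real pandas code): emitting a
# deprecation that follows the PDEP-17 warning-class policy.
import warnings

# In early minor releases this alias would point at DeprecationWarning;
# in the last minor release before the next major version it would be
# switched to FutureWarning for broader end-user visibility.
PandasDeprecationWarning = DeprecationWarning


def old_behavior(strict: bool = True) -> None:
    """Hypothetical function whose ``strict=False`` path is being deprecated."""
    if not strict:
        # GH#12345 (hypothetical issue number, per the policy of citing the
        # GitHub issue/PR above the warning line)
        warnings.warn(
            "strict=False is deprecated and will be removed in a future "
            "major release; pass strict=True instead.",
            PandasDeprecationWarning,
            stacklevel=2,
        )
```

Under a scheme like this, flipping the alias to ``FutureWarning`` in the final minor release before the removal changes the warning's visibility for end users without editing each call site.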